In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory of src to the path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.db.manager import DBManager
from src.input_to_instructions.load_and_execute import *
from src.input_to_instructions.types import *
from src.operation.execute import *
from src.response_generation.load_and_execute import *
from src.dateutils import normalize_sql_dates


INFO:datasets:PyTorch version 2.7.1+cu128 available.


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [3]:
from collections import defaultdict
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

# from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [4]:
BASE_DIR = "../"
def read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)

In [5]:
# Pick the attention implementation and dtype from the GPU's major compute capability.
# `torch` is imported explicitly so this cell does not rely on the name leaking in
# through one of the wildcard imports earlier in the notebook.
import torch

# get_device_capability() raises when no CUDA device is present, so check
# availability first and fall back to the conservative settings on CPU-only hosts.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16
print(f"attn_implementation: {attn_implementation}, torch_dtype: {torch_dtype}")


attn_implementation: flash_attention_2, torch_dtype: torch.bfloat16


# Load 

In [6]:
# ResponseGeneration.update_prompt()

# Initialise the response-generation backend once for the whole notebook,
# with its own output logging disabled and the "unsloth" instance type.
ResponseGeneration.initialize(
    log_output=False,
    instance_type="unsloth"
)
# Keep a module-level handle on the tokenizer exposed by ResponseGeneration;
# measure_token_count() below reads this global.
tokenizer = ResponseGeneration.tokenizer
print(tokenizer)

def measure_token_count(input: str) -> int:
    """Return how many token ids the global `tokenizer` produces for `str(input)`.

    `input` is passed through `str()` first, so non-string objects (for example
    a dict) are measured by their string representation.
    """
    token_ids = tokenizer.encode(str(input))
    return len(token_ids)

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.8.1: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.19 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

sh2orc/Llama-3.1-Korean-8B-Instruct does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.
PreTrainedTokenizerFast(name_or_path='sh2orc/Llama-3.1-Korean-8B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|finetune_right_pad_id|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|

In [7]:
import time
from src.input_to_instructions.types import InstructionQ_raw
def get_time(df, fmt="datetime"):
    """Return the unique 'timestamp' values of `df` as sorted, formatted strings.

    Args:
        df: Either a DataFrame whose 'timestamp' column holds datetime-like
            values, or a single row (e.g. a Series) whose 'timestamp' entry
            is one `pd.Timestamp`.
        fmt: "date" -> '%Y-%m-%d', "month" -> '%Y-%m', "year" -> '%Y'.
            Any other value (including the default "datetime") selects
            '%Y-%m-%d %H:%M:%S'.

    Returns:
        list[str]: the distinct formatted timestamps in ascending string order.

    Raises:
        KeyError: if `df` has no 'timestamp' entry.
    """
    strftime_patterns = {
        "date": '%Y-%m-%d',
        "month": '%Y-%m',
        "year": '%Y',
    }
    pattern = strftime_patterns.get(fmt, '%Y-%m-%d %H:%M:%S')

    # A single row has no `.columns`; show its index labels instead so the
    # debug line does not raise before the single-Timestamp branch is reached.
    labels = getattr(df, "columns", getattr(df, "index", None))
    print(f"get_time, col: {labels}, fmt: {pattern}")

    timestamps = df['timestamp']
    if isinstance(timestamps, pd.Timestamp):
        # One value only: wrap it in a list, otherwise set() below would split
        # the formatted string into its individual characters.
        formatted = [timestamps.strftime(pattern)]
    else:
        formatted = timestamps.apply(lambda ts: ts.strftime(pattern))
    return sorted(set(formatted))

def get_spatials(df):
    """Return the distinct values of the 'idu_name' column, as produced by `pd.unique`."""
    idu_column = df['idu_name']
    return pd.unique(idu_column)

def get_tv(df, col:str|list[str], fmt="datetime"):
    """Return `(timestamps, df[c1], df[c2], ...)` for the requested column(s).

    `timestamps` is whatever `get_time(df, fmt)` returns; `col` may be a single
    column name or a list of names, and one column object is appended to the
    tuple per name, in the given order.
    """
    column_names = [col] if isinstance(col, str) else col
    formatted_times = get_time(df, fmt)
    return (formatted_times, *[df[name] for name in column_names])

def data_(metadata, mapping, query_results, t=str|list[str], s=str|list[str], m=str|list[str]):
    if isinstance(t, str):
        t = [t]
    if isinstance(s, str):
        s = [s]
    if isinstance(m, str):
        m = [m]

    t_raw = [mapping.temporal[t_highlevel] for t_highlevel in t]
    s_raw = [mapping.spatials[s_highlevel] for s_highlevel in s]
    m_raw = [mapping.modalities[m_highlevel] for m_highlevel in m]
    
    # flatten s_raw into a list of strings
    # flattened = [item for sublist in data for item in (sublist if isinstance(sublist, list) else [sublist])]
    s_raw = [item for sublist in s_raw for item in (sublist if isinstance(sublist, list) else [sublist])]
    # print(s_raw)
    result_df = DBManager.structured_query_data_t_v2(metadata, m_raw, t_raw, s_raw, get_rowids=True)
    
    cols = list(result_df.columns)
    print(f"cols: {cols}")
    cols.remove("id")
    cols.remove("idu_name")
    cols.remove("timestamp")
    rows = list(result_df["id"])
    query_results.append({
        "result_columns": cols,
        "result_indices": rows,
    })
    # print(cols, rows)

    # For demo, drop rows where any value is -1
    result_df = result_df.loc[(result_df != -1).all(axis=1)]

    # drop "id" from result_df
    result_df = result_df.drop(columns=['id'])

    # change column names to high level
    inverse_mapping = {v: k for k, v in mapping.modalities.items()}
    result_df.columns = [inverse_mapping[col] if col in inverse_mapping else col for col in result_df.columns]

    # change idu_name raw values to high level
    inverse_mapping = {}
    for k, v in mapping.spatials.items():
        if isinstance(v, list):
            for v_ in v:
                inverse_mapping[v_] = k
        else:
            inverse_mapping[v] = k

    result_df["idu_name"] = result_df["idu_name"].map(inverse_mapping)

    return result_df


def run_query_v2(user_input, metadata, mapping, expectations, required_variables, scripts, exp_tag=None):
    """Execute the generated script lines and produce a natural-language response.

    When `scripts` is not None, each script line is run in order:
      * lines containing "data" have every "data(" rewritten into a call to
        `data_(metadata, mapping, query_results, ...)`;
      * lines containing "SELECT" are treated as `<variable> = "<sql>"`: the
        quoted SQL is extracted, "id, " is inserted after the first SELECT,
        the query is run through `DBManager.execute_structured_query_string`
        and the resulting frame (without `id`) is stored in `globals()` under
        `<variable>`;
      * every other line is run with `exec(script, globals())`.
    Afterwards all globals whose name starts with "v_" are passed to
    `ResponseGeneration.execute_v2`.

    When `scripts` is None, `ResponseGeneration.execute_v2` is called with no
    variables for the "woQM" / "woQM+Script" experiment tags; otherwise a fixed
    apology message listing the mapping keys whose value is "Unknown" is returned.

    Side effects: overwrites the globals `metadata`, `mapping` and
    `query_results`, deletes every existing global starting with "v_", and
    prints token counts / error messages.

    Returns:
        tuple: `(response, variables, required_variables, query_results)`.
        If any script or the response generation raises, the exception is
        caught and `(error message, variables, None, query_results)` is
        returned instead.

    NOTE(review): scripts are executed with `exec` in the notebook's global
    namespace; only run this on trusted model output.
    NOTE(review): `re` is not imported in the visible cells; presumably it
    arrives through one of the wildcard imports — confirm.
    """
    query_results = []
    variables = {}
    # print(f"exp_tag: {exp_tag}")
    if scripts is not None:

        # search data(t=~~, ...,)
        # The rewritten/exec'd scripts resolve these names from globals().
        globals()['metadata'] = metadata
        globals()['mapping'] = mapping
        globals()['query_results'] = query_results
        # Remove "v_*" variables left over from a previous call so they are not
        # reported as results of this one.
        for name in list(globals()):
            if name.startswith("v_"):
                del globals()[name]
        try:
            # Timing accumulators; they are only read by the commented-out print below.
            query_time = 0
            process_time = 0

            for script in scripts:
                try:
                    start_time = time.time()
                    if "data" in script:
                        script = script.replace("data(", "data_(metadata, mapping, query_results, ")

                    if "SELECT" in script:
                        # split only at the first '=' to avoid issues with '=' in SQL
                        variable, sql = script.split("=", 1)
                        variable = variable.strip()
                        sql = sql.strip()
                        # get all between \" and \"
                        sql = re.findall(r'"(.*)"', sql)
                        sql = sql[0]
                        # Replace only the first occurrence of "SELECT" with "SELECT id, ".
                        sql = sql.replace("SELECT", "SELECT id, ", 1)
                        df = DBManager.execute_structured_query_string(sql)
                        # Record the value columns and row ids this query returned.
                        cols = list(df.columns)
                        cols.remove("id")
                        cols.remove("idu_name")
                        cols.remove("timestamp")
                        rows = list(df["id"])
                        query_results.append({
                            "result_columns": cols,
                            "result_indices": rows,
                        })
                        df = df.drop(columns=['id'])
                        globals()[variable] = df
                    else:
                        exec(script, globals())

                    end_time = time.time()
                    # Any script mentioning "data" is counted as query time, the rest as processing time.
                    if "data" in script:
                        query_time += end_time - start_time
                    else:
                        process_time += end_time - start_time
                except Exception as e:
                    print(f"Error in executing script: {script}")
                    print(e)
                    raise e

            start_time = time.time()
            # Everything the scripts stored under a "v_" name is offered to the response generator.
            variables = {name:globals()[name] for name in globals() if name.startswith("v_")}
            response, required_variables = ResponseGeneration.execute_v2(expectations, required_variables, variables, user_input, exp_tag=exp_tag)
            rg_last_input_token_length = measure_token_count(ResponseGeneration.last_input_str)
            rg_last_output_token_length = measure_token_count(response)
            print("rg_last_input_token_length,", rg_last_input_token_length, ",rg_last_output_token_length,", rg_last_output_token_length)

            response_generation_time = time.time() - start_time

            # print(f"Question: {user_input}, query time: {query_time:.4f}s, processing time: {process_time:.4f}s, response generation time: {response_generation_time:.4f}s")
            return response, variables, required_variables, query_results
        except Exception as e:
            # Best effort: report the failure and return a fixed error message
            # together with whatever query results were recorded so far.
            print(f"Error in running query_v2: {e}")
            return "Ïã§ÌñâÏ§ë ÏóêÎü¨Í∞Ä Î∞úÏÉùÌñàÏäµÎãàÎã§.", variables, None, query_results
    else:
        if exp_tag in ["woQM", "woQM+Script"]:
            # These experiment variants generate a response without running any script.
            response, required_variables = ResponseGeneration.execute_v2(expectations, required_variables, variables, user_input, exp_tag=exp_tag)
            return response, variables, required_variables, query_results
        else:
            variables = {}
            # High-level names whose mapped value is the literal string "Unknown".
            unknown_spatials = [k for k, v in mapping.spatials.items() if v == "Unknown"]
            unknown_modalities = [k for k, v in mapping.modalities.items() if v == "Unknown"]

            response_unknown = f"Ï£ÑÏÜ°Ìï©ÎãàÎã§, {unknown_spatials + unknown_modalities}Îäî Ï°¥Ïû¨ÌïòÏßÄ ÏïäÎäî Í≥µÍ∞ÑÏù¥ÎÇò Î™®Îã¨Î¶¨Ìã∞ ÏûÖÎãàÎã§."
            return response_unknown, variables, [], query_results


def run_query(user_input, metadata, instructions, exp_tag=None):
    """Execute `instructions` in order and return the generated response.

    Instruction handling (dispatch is on the exact type, as before):
      * `InstructionQ`: query via `DBManager.structured_query_data_t`, record
        the queried columns / row ids, drop rows containing -1 and the `id`
        column, and store the frame under `instruction.result_name`.
      * `InstructionQ_raw`: run `instruction.query` (with `"id"` added to the
        select list) via `DBManager.execute_structured_query_string`, rename
        `idu_name` to `idu`, record columns / row ids, drop `id` and store the
        frame under `instruction.result_name`.
      * `InstructionO`: run `OperationExecutor.execute` on the current
        variables and merge the returned dict into them.
      * `InstructionR`: generate the response and return immediately.

    Args:
        user_input: the user's question, forwarded to response generation.
        metadata: metadata dict; only its "modality_mapping" and "idu_mapping"
            entries are forwarded to response generation.
        instructions: iterable of instruction objects.
        exp_tag: experiment tag forwarded to `ResponseGeneration.execute`.

    Returns:
        tuple `(response, variables_to_report, required_variables, query_results)`
        when an `InstructionR` is reached, otherwise `None`.
    """
    variables = {
        "Metadata": metadata,
    }
    query_results = []

    def _record_query(result_df):
        """Append the value columns and row ids of `result_df` to `query_results`."""
        value_columns = list(result_df.columns)
        value_columns.remove("id")
        value_columns.remove("idu")
        query_results.append({
            "result_columns": value_columns,
            "result_indices": list(result_df["id"]),
        })

    for instruction in instructions:
        # Exact-type comparison is kept on purpose: switching to isinstance()
        # could change which branch runs if the instruction classes are related.
        if type(instruction) == InstructionQ:
            result_df = DBManager.structured_query_data_t(metadata, instruction.args, get_rowids=True)
            _record_query(result_df)

            # For demo, drop rows where any value is -1
            result_df = result_df.loc[(result_df != -1).all(axis=1)]
            result_df = result_df.drop(columns=['id'])

            variables[instruction.result_name] = result_df

        elif type(instruction) == InstructionQ_raw:
            # Build the id-augmented SQL in a local variable. The previous code
            # wrote it back to `instruction.query`, so running the same
            # instruction twice selected "id" twice.
            query_with_id = instruction.query.replace(" FROM \"data_t\"", ", \"id\" FROM \"data_t\"")
            result_df = DBManager.execute_structured_query_string(query_with_id)
            # rename idu_name to idu
            result_df = result_df.rename(columns={'idu_name': 'idu'})
            _record_query(result_df)

            result_df = result_df.drop(columns=['id'])
            variables[instruction.result_name] = result_df

        elif type(instruction) == InstructionO:
            result_dict = OperationExecutor.execute(variables, instruction.scripts)
            variables.update(result_dict)

        elif type(instruction) == InstructionR:
            # Everything except the raw metadata is reported back to the caller.
            variables_to_report = {k: v for k, v in variables.items() if k not in ["Metadata"]}

            keys_to_leave = ["modality_mapping", "idu_mapping"]
            metadata_ = {key: metadata[key] for key in metadata.keys() if key in keys_to_leave}

            response, required_variables = ResponseGeneration.execute(instruction, variables, user_input, metadata_, exp_tag=exp_tag)
            return response, variables_to_report, required_variables, query_results

    # No InstructionR was present: nothing to report (same as the former implicit None).
    return None

# Eval query

In [8]:
import time
from typing import Any  # Any ÌÉÄÏûÖ import ÌïÑÏöî
from copy import deepcopy
class EM:
    """Names of the per-query evaluation fields.

    eval_query() uses these strings as keys of each evaluation report and,
    through that, as column names of the resulting DataFrame.
    """
    # True when the candidate is a dict containing every required top-level key.
    json_structure = "JsonStructureCorrectness"
    # Number of (column, row id) pairs found in both ground truth and candidate.
    true_positive = "QueryTruePositive"
    # Number of (column, row id) pairs found only in the candidate.
    false_positive = "QueryFalsePositive"
    # Number of (column, row id) pairs found only in the ground truth.
    false_negative = "QueryFalseNegative"
    
def eval_query(cand_response_filename, db_gt_filename="./gts.json"):
    """Score the candidate outputs in `cand_response_filename` against the ground truth.

    For every candidate entry that has a ground-truth entry with the same
    "Input" and "Scenario":
      1. check that the candidate is a dict with the required top-level keys
         ("Thinking", "Expectations", "Mapping", minus the one the experiment
         tag removes);
      2. post-process it with `InputToInstruction.postprocess_v2` and run it
         with `run_query_v2`;
      3. compare the queried (column, row id) pairs with the ground truth and
         count true positives, false positives and false negatives.

    The experiment tag is the 4th "_"-separated part of the file's base name.

    Side effects:
        Writes `<cand_response_filename with '.json' -> '_response.json'>`
        containing the ground-truth and predicted responses, and prints
        progress, timing and the summary metrics.

    Args:
        cand_response_filename: path of the candidate JSON file.
        db_gt_filename: path of the ground-truth JSON file.

    Returns:
        pandas.DataFrame with one row per evaluated candidate. The TP/FP/FN
        columns are normalised by their per-row total, and "ExactMatch" and
        "Total" columns are added. An empty frame is returned when no
        candidate could be matched to a ground-truth entry.
    """
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    evaluation_reports = []
    response_reports = []
    time_reports = []

    def _merge_query_results(query_results):
        """Merge per-query results into {column name: [row ids, ...]}."""
        merged = defaultdict(list)
        for query_result in query_results:
            for col in query_result["result_columns"]:
                merged[col].extend(query_result["result_indices"])
        return merged

    def _set_counts(report, true_positive, false_positive, false_negative):
        """Store the three confusion counts on an evaluation report."""
        report[EM.true_positive] = true_positive
        report[EM.false_positive] = false_positive
        report[EM.false_negative] = false_negative

    with tqdm(total=len(cand_responses)) as pbar:
        for cand_response in cand_responses:
            cand_input = cand_response["Input"]
            scenario = cand_response["Scenario"]
            metadata = cand_response.get("Metadata")

            # Skip questions that have no ground-truth entry.
            gt_matches = [d for d in db_gts if d["Input"] == cand_input and d["Scenario"] == scenario]
            assert len(gt_matches) <= 1
            if not gt_matches:
                print(f"No ground truth found for {cand_input}")
                pbar.update(1)
                continue

            gt_report = gt_matches[0]
            tags = gt_report["Tags"]

            gt_query_results = _merge_query_results(gt_report["QueryResults"])
            gt_total_combinations = sum(len(v) for v in gt_query_results.values())

            gt_response = gt_report["Response"]
            user_input = gt_report["Input"]
            exp_tag = cand_response_filename.split("/")[-1].split("_")[3]
            print(cand_response_filename, exp_tag)
            response_report = {
                "Input": user_input,
                "Metadata": metadata,
                "GT_Response": gt_response,
            }

            # Missing fields read as None instead of raising.
            evaluation_report: dict[str, Any] = defaultdict(lambda: None)
            evaluation_report["Input"] = cand_input
            evaluation_report["Metadata"] = metadata
            evaluation_report["Tags"] = tags

            # --- 1. structural check ------------------------------------
            candidate = cand_response["Candidate"]
            if isinstance(candidate, dict):
                requirements = ["Thinking", "Expectations", "Mapping"]
                if exp_tag in ["WoThinking", "WoMetadata+Thinking"]:
                    requirements.remove("Thinking")
                elif exp_tag in ["woExp"]:
                    requirements.remove("Expectations")
                elif exp_tag in ["woQM", "woQM+Script"]:
                    requirements.remove("Mapping")
                evaluation_report[EM.json_structure] = all(
                    requirement in candidate for requirement in requirements
                )
            else:
                evaluation_report[EM.json_structure] = False

            if not evaluation_report[EM.json_structure]:
                # Nothing can be executed: every expected pair is a miss.
                _set_counts(evaluation_report, 0, 0, gt_total_combinations)
                print("Failed to parse input: ", cand_input, candidate)
                evaluation_reports.append(evaluation_report)
                response_reports.append(response_report)
                pbar.update(1)
                continue

            # --- 2. post-process and execute ----------------------------
            start_time = time.time()
            expertLLM_output_token_length = measure_token_count(candidate)
            print("Input,", cand_response["Input"], ",expertLLM_output_tlen,", expertLLM_output_token_length)

            if exp_tag in ["woExp"]:
                cand_response["Candidate"]["Expectations"] = []

            try:
                mapping, expectations, required_variables, script = InputToInstruction.postprocess_v2(
                    deepcopy(cand_response["Candidate"]),
                    exp_tag=exp_tag
                )
            except Exception as exc:
                # Was a bare `except:`; that also swallowed KeyboardInterrupt
                # and hid the reason for the failure.
                print(f"Failed to postprocess candidate for {cand_input}: {exc}")
                _set_counts(evaluation_report, 0, 0, gt_total_combinations)
                evaluation_reports.append(evaluation_report)
                response_reports.append(response_report)
                pbar.update(1)
                continue

            response, variables_to_report, required_variables, _cand_query_results = run_query_v2(
                user_input, metadata, mapping, expectations, required_variables, script, exp_tag=exp_tag
            )
            response_report["PD_Response"] = response
            time_reports.append(time.time() - start_time)
            response_reports.append(response_report)

            # --- 3. compare queried data with the ground truth ----------
            if len(_cand_query_results) == 0:
                _set_counts(evaluation_report, 0, 0, gt_total_combinations)
                evaluation_reports.append(evaluation_report)
                pbar.update(1)
                continue

            cand_query_results = _merge_query_results(_cand_query_results)

            # Count over the UNION of columns. Iterating only the columns both
            # sides share (as before) ignored columns missing from, or wrongly
            # added by, the candidate, so a candidate that queried entirely
            # wrong columns scored 0 FP / 0 FN and counted as an exact match.
            # This also covers an empty ground truth: every candidate pair is
            # then a false positive (the former special case derived that count
            # from the ground truth and was therefore always 0).
            true_positive = 0
            false_negative = 0
            false_positive = 0
            for col in set(gt_query_results) | set(cand_query_results):
                gt_ids = set(gt_query_results.get(col, ()))
                cand_ids = set(cand_query_results.get(col, ()))
                true_positive += len(gt_ids & cand_ids)
                false_negative += len(gt_ids - cand_ids)
                false_positive += len(cand_ids - gt_ids)

            _set_counts(evaluation_report, true_positive, false_positive, false_negative)
            evaluation_reports.append(evaluation_report)
            pbar.update(1)

    with open(f"{cand_response_filename.replace('.json', '_response.json')}", "w", encoding="utf-8") as f:
        json.dump(response_reports, f, ensure_ascii=False, indent=4)

    # Guard the average: no candidate may have reached the execution step.
    mean_time = sum(time_reports) / len(time_reports) if time_reports else float("nan")
    print(f"Time: {time_reports}, {mean_time}")

    eval_df = pd.DataFrame(evaluation_reports)
    if eval_df.empty:
        print("No candidate could be evaluated; skipping metric computation.")
        return eval_df

    eval_df['ExactMatch'] = eval_df.apply(lambda x: x[EM.false_positive] == 0 and x[EM.false_negative] == 0, axis=1).astype(int)

    final_result = {}
    for col in ["JsonStructureCorrectness", "ExactMatch"]:
        final_result[col] = eval_df[col].mean()

    # Normalise per query so that every question carries the same weight.
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_print = eval_df.drop(columns=["Metadata", "Tags"])
    print(eval_print)
    eval_df[EM.true_positive] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df[EM.false_positive] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df[EM.false_negative] = eval_df[EM.false_negative] / eval_df["Total"]

    # Rows with Total == 0 become NaN and are skipped by .sum().
    truepos_sum, falsepos_sum, falseneg_sum = eval_df[EM.true_positive].sum(), eval_df[EM.false_positive].sum(), eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    print(truepos_sum, falsepos_sum, falseneg_sum)
    print(precision, recall)
    f1 = 2 * (precision * recall) / (precision + recall)
    final_result["F1"] = f1
    final_result["Recall"] = recall

    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")

    return eval_df

# RUN eval

In [9]:
# name = "r-v7_r256_a512_ours_tr6_0503-checkpoint-63"
# name = "r-v7_r256_a512_ours_tr18_0503-checkpoint-52"
# name = "r-v7_r256_a512_ours_tr30_0503-checkpoint-54"
# name = "r-v7_r256_a512_ours_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_ours_tr60_0503-checkpoint-108"

# name = "r-v7_r256_a512_woall_tr6_0503-checkpoint-28"
# name = "r-v7_r256_a512_woall_tr18_0503-checkpoint-70"
# name = "r-v7_r256_a512_woall_tr30_0503-checkpoint-57"
# name = "r-v7_r256_a512_woall_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_woall_tr60_0503-checkpoint-90"

# Candidate result files to evaluate, given as file names without the ".json"
# extension. eval_query() takes its experiment tag from the 4th "_"-separated
# part of each name.
names = [
"r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41",
"r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41",
"r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41",
"r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70",
]

# Evaluate each file in turn against the default ground-truth file; the
# DataFrame returned by eval_query() is not kept, only its printed summary is used.
for name in names:
    eval_query(
        f"../experiments/result_3rdyear/{name}.json"
    )

  0%|          | 0/12 [00:00<?, ?it/s]

../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 456
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


  8%|‚ñä         | 1/12 [00:02<00:29,  2.65s/it]

rg_last_input_token_length, 221 ,rg_last_output_token_length, 57
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 333
cols: ['roomtemp', 'idu_name', 'settemp', 'id', 'timestamp']


 17%|‚ñà‚ñã        | 2/12 [00:03<00:14,  1.45s/it]

rg_last_input_token_length, 149 ,rg_last_output_token_length, 35
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ? ,expertLLM_output_tlen, 746
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 25%|‚ñà‚ñà‚ñå       | 3/12 [00:05<00:16,  1.79s/it]

cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
get_time, col: Index(['ÏÑ§Ï†ïÏò®ÎèÑ_Ïã§ÎÇ¥Ïò®ÎèÑ_Ï∞®Ïù¥'], dtype='object'), fmt: %Y-%m-%d
Error in executing script: v_ÏßÄÎÇúÎã¨_ÏÑ§Ï†ïÏò®ÎèÑ_Ïã§ÎÇ¥Ïò®ÎèÑ_Ï∞®Ïù¥_ÏµúÍ≥†_ÎÇ†Ïßú = get_time(v_ÏßÄÎÇúÎã¨_ÏÑ§Ï†ïÏò®ÎèÑ_Ïã§ÎÇ¥Ïò®ÎèÑ_Ï∞®Ïù¥_ÏµúÍ≥†_df, fmt='date')
'timestamp'
Error in running query_v2: 'timestamp'
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 444
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 33%|‚ñà‚ñà‚ñà‚ñé      | 4/12 [00:08<00:17,  2.15s/it]

rg_last_input_token_length, 213 ,rg_last_output_token_length, 45
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, 2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 363
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
get_time, col: Index(['Ïã§ÎÇ¥Ïò®ÎèÑ', 'idu_name', 'timestamp'], dtype='object'), fmt: %Y-%m-%d


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 5/12 [00:09<00:12,  1.73s/it]

rg_last_input_token_length, 142 ,rg_last_output_token_length, 29
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò ,expertLLM_output_tlen, 105
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏòÜÎ∞ò ÏäµÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 109
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇú 3Ïùº ÎèôÏïà Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥ Ïò®ÎèÑ ÌèâÍ∑† Í∞í ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 245
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 8/12 [00:09<00:03,  1.25it/s]

rg_last_input_token_length, 98 ,rg_last_output_token_length, 23
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò§Îäò Ïò§ÌõÑ 5ÏãúÏóê ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî Ïñ¥Îï†Ïñ¥? ,expertLLM_output_tlen, 248
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 9/12 [00:10<00:02,  1.39it/s]

rg_last_input_token_length, 97 ,rg_last_output_token_length, 22
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò¨Ìï¥ Ïó¨Î¶Ñ Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥Ïò®ÎèÑ ÏµúÎåÄÍ∞íÍ≥º ÏµúÏÜåÍ∞í ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 318
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 10/12 [00:12<00:02,  1.05s/it]

rg_last_input_token_length, 149 ,rg_last_output_token_length, 33
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_4bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Failed to parse input:  Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ? {"Thinking": "ÏÇ¨Ïö©ÏûêÎäî ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Ïã§ÎÇ¥Ïò®ÎèÑÍ∞Ä Îçî ÎÜíÏùÄ Í≥≥ÏùÑ ÏïåÍ≥†Ïã∂Ïñ¥Ìï®. ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò Ïã§ÎÇ¥Ïò®ÎèÑÎ•º ÏøºÎ¶¨Ìïú ÌõÑ Îçî ÎÜíÏùÄ Ïã§ÎÇ¥Ïò®ÎèÑÎ•º Í∞ÄÏßÑ Î∞©Í≥º Í∑∏ Ïò®ÎèÑÎ•º Î∞òÌôòÌïòÎ©¥ Îê®.", "Expectations": ["{{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Í≥µÍ∞Ñ}}({{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Í≥µÍ∞Ñ_Ïã§ÎÇ¥Ïò®ÎèÑ}}‚ÑÉ)Ïù¥ {{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Í≥µÍ∞ÑÎ≥Ñ_Í≥µÍ∞Ñ}}Î≥¥Îã§ {{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_ÏµúÍ≥†_Í≥µÍ∞ÑÎ≥Ñ_Í≥µÍ∞Ñ_Ï∞®Ïù¥}}‚ÑÉ Îçî ÎÜíÏäµÎãàÎã§."], "Mapping": {"temporal": {"ÌòÑÏû¨": "LAST_RECORD"}, "spatials": {"Ïö∞Î¶¨Î∞ò": "02_I81", "ÏïûÎ∞ò": "01_IB7"}, "modalities": {"Ïã§ÎÇ¥Ïò®ÎèÑ": "roomtemp"}}, "Script": ["v_ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_Ïã§ÎÇ¥Ïò®ÎèÑ_df = data(t='ÌòÑÏû¨',s='Ïö∞Î¶¨Î∞

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:12<00:00,  1.06s/it]


rg_last_input_token_length, 82 ,rg_last_output_token_length, 19
Time: [2.6505610942840576, 0.6098086833953857, 2.1940596103668213, 2.70094895362854, 0.9671032428741455, 0.00032258033752441406, 0.0001811981201171875, 0.6127476692199707, 0.4335818290710449, 2.0798933506011963, 0.405653715133667], 1.1504419933665881
                               Input  JsonStructureCorrectness  \
0             Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò                      True   
1              ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò.                      True   
2   ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ?                      True   
3        Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò                      True   
4                   2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò                      True   
5                      ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò                      True   
6                      

  0%|          | 0/12 [00:00<?, ?it/s]

../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 362
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


  8%|‚ñä         | 1/12 [00:01<00:13,  1.24s/it]

rg_last_input_token_length, 153 ,rg_last_output_token_length, 36
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 379
cols: ['idu_name', 'settemp', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 17%|‚ñà‚ñã        | 2/12 [00:01<00:09,  1.04it/s]

rg_last_input_token_length, 150 ,rg_last_output_token_length, 36
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ? ,expertLLM_output_tlen, 782
Error in executing script: v_ÏßÄÎÇúÎã¨_ÏÑ§Ï†ïÏò®ÎèÑ_df = data_(metadata, mapping, query_results, t='ÏßÄÎÇúÎã¨',s='Ïö∞Î¶¨Î∞ò,ÏïûÎ∞ò,ÏòÜÎ∞ò',m='ÏÑ§Ï†ïÏò®ÎèÑ')
'Ïö∞Î¶¨Î∞ò,ÏïûÎ∞ò,ÏòÜÎ∞ò'
Error in running query_v2: 'Ïö∞Î¶¨Î∞ò,ÏïûÎ∞ò,ÏòÜÎ∞ò'
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 443
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 33%|‚ñà‚ñà‚ñà‚ñé      | 4/12 [00:03<00:06,  1.22it/s]

rg_last_input_token_length, 221 ,rg_last_output_token_length, 53
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, 2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 421
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
get_time, col: Index(['Ïã§ÎÇ¥Ïò®ÎèÑ', 'idu_name', 'timestamp'], dtype='object'), fmt: %Y-%m-%d


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 5/12 [00:04<00:06,  1.01it/s]

rg_last_input_token_length, 58 ,rg_last_output_token_length, 17
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò ,expertLLM_output_tlen, 182
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 6/12 [00:05<00:04,  1.25it/s]

rg_last_input_token_length, 71 ,rg_last_output_token_length, 16
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏòÜÎ∞ò ÏäµÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 109
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇú 3Ïùº ÎèôÏïà Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥ Ïò®ÎèÑ ÌèâÍ∑† Í∞í ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 248
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 8/12 [00:05<00:02,  1.67it/s]

rg_last_input_token_length, 98 ,rg_last_output_token_length, 23
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò§Îäò Ïò§ÌõÑ 5ÏãúÏóê ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî Ïñ¥Îï†Ïñ¥? ,expertLLM_output_tlen, 247
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 9/12 [00:06<00:01,  1.80it/s]

rg_last_input_token_length, 97 ,rg_last_output_token_length, 22
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò¨Ìï¥ Ïó¨Î¶Ñ Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥Ïò®ÎèÑ ÏµúÎåÄÍ∞íÍ≥º ÏµúÏÜåÍ∞í ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 320
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 10/12 [00:08<00:01,  1.05it/s]

rg_last_input_token_length, 149 ,rg_last_output_token_length, 33
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ? ,expertLLM_output_tlen, 444
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 11/12 [00:09<00:00,  1.17it/s]

rg_last_input_token_length, 129 ,rg_last_output_token_length, 23
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_8bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÍ∏à 4Ï∏µ ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 216
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:09<00:00,  1.27it/s]


rg_last_input_token_length, 82 ,rg_last_output_token_length, 19
Time: [1.2384097576141357, 0.7592594623565674, 0.0006930828094482422, 1.4721522331237793, 1.378847360610962, 0.36470890045166016, 0.0002384185791015625, 0.7318768501281738, 0.433943510055542, 2.073249340057373, 0.5949814319610596, 0.4005870819091797], 0.7874122858047485
                               Input  JsonStructureCorrectness  \
0             Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò                      True   
1              ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò.                      True   
2   ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ?                      True   
3        Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò                      True   
4                   2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò                      True   
5                      ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò                      True   
6  

  0%|          | 0/12 [00:00<?, ?it/s]

../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 446
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 17%|‚ñà‚ñã        | 2/12 [00:01<00:07,  1.42it/s]

rg_last_input_token_length, 209 ,rg_last_output_token_length, 45
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 338
cols: ['roomtemp', 'idu_name', 'settemp', 'id', 'timestamp']
Error in executing script: v_ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_ÏÑ§Ï†ïÏò®ÎèÑ-ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_Ïã§ÎÇ¥Ïò®ÎèÑ = v_ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_ÏÑ§Ï†ïÏò®ÎèÑ - v_ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_Ïã§ÎÇ¥Ïò®ÎèÑ
cannot assign to expression here. Maybe you meant '==' instead of '='? (<string>, line 1)
Error in running query_v2: cannot assign to expression here. Maybe you meant '==' instead of '='? (<string>, line 1)
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ? ,expertLLM_output_tlen, 763
cols: ['idu_n

 25%|‚ñà‚ñà‚ñå       | 3/12 [00:08<00:33,  3.68s/it]

cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
Error in executing script: v_ÏßÄÎÇúÎã¨_ÏÑ§Ï†ïÏò®ÎèÑ_Ïã§ÎÇ¥Ïò®ÎèÑ_df = pd.merge(v_ÏßÄÎÇúÎã¨_ÏÑ§Ï†ïÏò®ÎèÑ_df, v_ÏßÄÎÇúÎã¨_Ïã§ÎÇ¥Ïò®ÎèÑ_df, how='inner', on=['date','spatial'])
'date'
Error in running query_v2: 'date'
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 443
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 33%|‚ñà‚ñà‚ñà‚ñé      | 4/12 [00:10<00:21,  2.75s/it]

rg_last_input_token_length, 212 ,rg_last_output_token_length, 42
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, 2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 390


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 5/12 [00:11<00:15,  2.24s/it]

cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
rg_last_input_token_length, 56 ,rg_last_output_token_length, 14
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò ,expertLLM_output_tlen, 182
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 6/12 [00:11<00:09,  1.60s/it]

rg_last_input_token_length, 71 ,rg_last_output_token_length, 16
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏòÜÎ∞ò ÏäµÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 109
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÎÇú 3Ïùº ÎèôÏïà Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥ Ïò®ÎèÑ ÌèâÍ∑† Í∞í ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 248
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 8/12 [00:12<00:03,  1.01it/s]

rg_last_input_token_length, 98 ,rg_last_output_token_length, 23
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò§Îäò Ïò§ÌõÑ 5ÏãúÏóê ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî Ïñ¥Îï†Ïñ¥? ,expertLLM_output_tlen, 244
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 9/12 [00:13<00:02,  1.18it/s]

rg_last_input_token_length, 107 ,rg_last_output_token_length, 22
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïò¨Ìï¥ Ïó¨Î¶Ñ Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥Ïò®ÎèÑ ÏµúÎåÄÍ∞íÍ≥º ÏµúÏÜåÍ∞í ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 340
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 10/12 [00:15<00:02,  1.18s/it]

rg_last_input_token_length, 157 ,rg_last_output_token_length, 41
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ? ,expertLLM_output_tlen, 678
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 11/12 [00:15<00:01,  1.04s/it]

rg_last_input_token_length, 248 ,rg_last_output_token_length, 29
../experiments/result_3rdyear/r-3rdyear_r211_a422_sh2orc-Llama-3.1-Korean-8B-Instruct_tr27_16bit-step-41.json sh2orc-Llama-3.1-Korean-8B-Instruct
Input, ÏßÄÍ∏à 4Ï∏µ ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 219
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:16<00:00,  1.35s/it]


rg_last_input_token_length, 82 ,rg_last_output_token_length, 19
Time: [1.5104033946990967, 0.13995862007141113, 7.2114198207855225, 1.3096137046813965, 1.3305034637451172, 0.3611128330230713, 0.00021505355834960938, 0.7338883876800537, 0.4345383644104004, 2.0604357719421387, 0.6744613647460938, 0.39554452896118164], 1.346841275691986
                               Input  JsonStructureCorrectness  \
0             Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò                      True   
1              ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò.                      True   
2   ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ?                      True   
3        Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò                      True   
4                   2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò                      True   
5                      ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò                      True   
6 

  0%|          | 0/12 [00:00<?, ?it/s]

../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 362
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


  8%|‚ñä         | 1/12 [00:01<00:16,  1.51s/it]

rg_last_input_token_length, 153 ,rg_last_output_token_length, 36
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò. ,expertLLM_output_tlen, 262
cols: ['idu_name', 'settemp', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 17%|‚ñà‚ñã        | 2/12 [00:02<00:10,  1.04s/it]

rg_last_input_token_length, 97 ,rg_last_output_token_length, 32
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ? ,expertLLM_output_tlen, 569
cols: ['roomtemp', 'idu_name', 'settemp', 'id', 'timestamp']
get_time, col: Index(['Ïã§ÎÇ¥Ïò®ÎèÑ', 'idu_name', 'ÏÑ§Ï†ïÏò®ÎèÑ', 'timestamp', 'Ïò®ÎèÑÏ∞®Ïù¥'], dtype='object'), fmt: %Y-%m-%d


 25%|‚ñà‚ñà‚ñå       | 3/12 [00:05<00:17,  1.93s/it]

rg_last_input_token_length, 368 ,rg_last_output_token_length, 99
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 447
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 33%|‚ñà‚ñà‚ñà‚ñé      | 4/12 [00:06<00:14,  1.79s/it]

rg_last_input_token_length, 212 ,rg_last_output_token_length, 42
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, 2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 379
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']
get_time, col: Index(['Ïã§ÎÇ¥Ïò®ÎèÑ', 'idu_name', 'timestamp'], dtype='object'), fmt: %Y-%m-%d


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 6/12 [00:08<00:06,  1.12s/it]

rg_last_input_token_length, 166 ,rg_last_output_token_length, 59
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò ,expertLLM_output_tlen, 184
cols: ['idu_name', 'settemp', 'id', 'timestamp']
Error in executing script: v_ÌòÑÏû¨_ÌôîÏÑ±_ÏÑ§Ï†ïÏò®ÎèÑ = v_ÌòÑÏû¨_ÌôîÏÑ±_ÏÑ§Ï†ïÏò®ÎèÑ_df['ÏÑ§Ï†ïÏò®ÎèÑ'].values[0]
index 0 is out of bounds for axis 0 with size 0
Error in running query_v2: index 0 is out of bounds for axis 0 with size 0
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, ÏòÜÎ∞ò ÏäµÎèÑ ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 108
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, ÏßÄÎÇú 3Ïùº ÎèôÏïà Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥ Ïò®ÎèÑ ÌèâÍ∑† Í∞í ÏïåÎ†§

 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 8/12 [00:08<00:02,  1.39it/s]

rg_last_input_token_length, 99 ,rg_last_output_token_length, 24
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, Ïò§Îäò Ïò§ÌõÑ 5ÏãúÏóê ÏòÜÎ∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑÎäî Ïñ¥Îï†Ïñ¥? ,expertLLM_output_tlen, 240
cols: ['idu_name', 'settemp', 'id', 'timestamp']


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 9/12 [00:09<00:01,  1.55it/s]

rg_last_input_token_length, 93 ,rg_last_output_token_length, 22
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Input, Ïò¨Ìï¥ Ïó¨Î¶Ñ Ïö∞Î¶¨Î∞ò Ïã§ÎÇ¥Ïò®ÎèÑ ÏµúÎåÄÍ∞íÍ≥º ÏµúÏÜåÍ∞í ÏïåÎ†§Ï§ò ,expertLLM_output_tlen, 552
cols: ['roomtemp', 'idu_name', 'id', 'timestamp']


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 10/12 [00:11<00:02,  1.16s/it]

rg_last_input_token_length, 229 ,rg_last_output_token_length, 79
../experiments/result_3rdyear/r-3rdyear_r450_a900_Bllossom-llama-3.2-Korean-Bllossom-3B_tr27_16bit-step-70.json Bllossom-llama-3.2-Korean-Bllossom-3B
Failed to parse input:  Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Í∞ÄÏû• ÎçîÏö¥ Î∞©ÏùÄ? {"Thinking": "ÏÇ¨Ïö©ÏûêÎäî ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞ò Ï§ë Ïã§ÎÇ¥Ïò®ÎèÑÍ∞Ä Îçî ÎÜíÏùÄ Î∞©ÏùÑ ÏïåÍ≥†Ïã∂Ïñ¥Ìï®. ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò Ïã§ÎÇ¥Ïò®ÎèÑÎ•º ÏøºÎ¶¨Ìïú ÌõÑ Îçî ÎÜíÏùÄ Ïã§ÎÇ¥Ïò®ÎèÑÎ•º Í∞ÄÏßÑ Î∞©Í≥º Í∑∏ Ïò®ÎèÑÎ•º Î∞òÌôòÌïòÎ©¥ Îê®.", "Expectations": ["{{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_Îçî¬†jealous}}Ïù¥({{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_Îçî¬†jealous}}‚ÑÉ) {{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_Îçî¬†jealous}}‚ÑÉÎ°ú {{ÌòÑÏû¨_Ïã§ÎÇ¥Ïò®ÎèÑ_Îçî¬†jealous}}‚ÑÉ Îçî ÎçîÏö¥ Î∞©ÏûÖÎãàÎã§."], "Mapping": {"temporal": {"ÌòÑÏû¨": "LAST_RECORD"}, "spatials": {"Ïö∞Î¶¨Î∞ò": "01_IB5", "ÏïûÎ∞ò": "01_IB7"}, "modalities": {"Ïã§ÎÇ¥Ïò®ÎèÑ": "roomtemp"}}, "Script": ["v_ÌòÑÏû¨_Ïö∞Î¶¨Î∞ò_Ïã§ÎÇ¥Ïò®ÎèÑ_df = data(t='ÌòÑÏû¨',s='Ïö∞Î¶¨Î∞ò',m='Ïã§ÎÇ¥Ïò®ÎèÑ')", "v_ÌòÑÏû¨

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:12<00:00,  1.02s/it]

rg_last_input_token_length, 82 ,rg_last_output_token_length, 19
Time: [1.5098509788513184, 0.7125959396362305, 2.987276792526245, 1.5716493129730225, 1.348240852355957, 0.14041733741760254, 0.00035572052001953125, 0.5936524868011475, 0.43607401847839355, 2.5615596771240234, 0.3954331874847412], 1.1142823912880637
                               Input  JsonStructureCorrectness  \
0             Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò                      True   
1              ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò.                      True   
2   ÏßÄÎÇúÎã¨Ïóê ÏÑ§Ï†ïÏò®ÎèÑÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÎßéÏù¥ ÎÇ¨Îçò ÎÇ†ÏùÄ?                      True   
3        Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏòÜÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò                      True   
4                   2Ï£ºÏ†Ñ Í∞ÄÏû• ÎçîÏõ†Îçò ÎÇ† ÏïåÎ†§Ï§ò                      True   
5                      ÌôîÏÑ±Ïùò ÏÑ§Ï†ïÏò®ÎèÑ ÌôïÏù∏Ìï¥Ï§ò                      True   
6                      




In [None]:
def _derive_exp_tag(name):
    """Build a short experiment tag such as ``"8B_4bit"`` from a result-file stem.

    The tag is ``"<model size>_<bit width>"``, where the model size is the first
    ``<digits>B`` token in ``name`` and the bit width is the first
    ``_<digits>bit`` token.

    If either token is missing, ``name`` itself is returned. This keeps each
    result file under its own key: previously the tag from the preceding loop
    iteration was silently reused (or a ``NameError`` was raised on the first
    file), so one model's responses could overwrite another's.
    """
    # Extract the model size (e.g. 8B or 3B).
    model_size_match = re.search(r'(\d+B)', name)
    # Extract the bit width (e.g. 4bit, 8bit, 16bit).
    bit_match = re.search(r'_(\d+bit)', name)

    if model_size_match and bit_match:
        return f"{model_size_match.group(1)}_{bit_match.group(1)}"
    return name


def _grouping_key(value):
    """Return a hashable key under which equal response values are grouped.

    Hashable values (the usual case: plain strings) are used as-is. Unhashable
    values are keyed by their canonical JSON text.
    NOTE(review): list-valued responses are assumed possible only because of
    earlier commented-out handling for them; confirm against the result files.
    """
    try:
        hash(value)
    except TypeError:
        return ("json", json.dumps(value, ensure_ascii=False, sort_keys=True))
    return value


def _merge_identical_responses(per_model):
    """Collapse experiment tags whose predicted responses are exactly equal.

    Args:
        per_model: dict with a ``"GT_Response"`` entry plus one entry per
            experiment tag, mapping the tag to that model's response.

    Returns:
        A new dict. Tags sharing the same response are merged under a single
        key ``str(tuple(sorted(tags)))``; tags with a unique response keep
        their own key. ``"GT_Response"`` is appended last.

    Raises:
        KeyError: if ``per_model`` has no ``"GT_Response"`` entry.
    """
    # Dicts preserve insertion order, so groups come out in order of first appearance.
    groups = {}
    for tag, value in per_model.items():
        if tag == "GT_Response":
            continue
        groups.setdefault(_grouping_key(value), (value, []))[1].append(tag)

    merged = {}
    for value, tags in groups.values():
        merged_key = str(tuple(sorted(tags))) if len(tags) > 1 else tags[0]
        merged[merged_key] = value

    # Add the ground truth back after the model responses.
    merged["GT_Response"] = per_model["GT_Response"]
    return merged


# Collect, per user input, the ground-truth response and each experiment's prediction.
responses = {}

for name in names:
    exp_tag = _derive_exp_tag(name)
    print(exp_tag)

    # Load the file fully, then release the handle before processing its items.
    with open(f"../experiments/result_3rdyear/{name}_response.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        # Named `user_input` so the builtin `input` is not shadowed in the kernel.
        user_input = item["Input"]
        if user_input not in responses:
            responses[user_input] = {
                "GT_Response": item["GT_Response"],
            }

        # Items without a prediction get a fixed placeholder string instead.
        responses[user_input][exp_tag] = item.get("PD_Response", "Ïã§ÌñâÏ§ë ÏóêÎü¨ Î∞úÏÉù")

# Merge experiments that produced exactly the same response for an input.
responses = {
    user_input: _merge_identical_responses(per_model)
    for user_input, per_model in responses.items()
}

# Save to JSON.
with open("responses.json", "w", encoding="utf-8") as f:
    json.dump(responses, f, ensure_ascii=False, indent=4)
print(responses)

8B_4bit
8B_8bit
8B_16bit
3B_16bit
{'Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÍ≥º ÏïûÎ∞òÏùò ÌèâÍ∑† Ïò®ÎèÑ ÏïåÎ†§Ï§ò': {'8B_4bit': 'Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.98‚ÑÉÏù¥Í≥†, ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.11‚ÑÉÏûÖÎãàÎã§. Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÎ≥¥Îã§ ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÍ∞Ä -0.87‚ÑÉ ÎÇÆÏäµÎãàÎã§.', "('3B_16bit', '8B_8bit')": 'Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.98‚ÑÉÏù¥Í≥†, ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.11‚ÑÉÏûÖÎãàÎã§.', '8B_16bit': 'Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.98‚ÑÉÏù¥Í≥†, ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.11‚ÑÉÏûÖÎãàÎã§. Ï∞®Ïù¥Îäî 0.87‚ÑÉÏûÖÎãàÎã§.', 'GT_Response': 'Ïù¥Î≤àÏ£º Ïö∞Î¶¨Î∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.98¬∞CÏù¥Í≥†, ÏïûÎ∞òÏùò ÌèâÍ∑† Ïã§ÎÇ¥Ïò®ÎèÑÎäî 25.11¬∞CÏûÖÎãàÎã§.'}, 'ÌòÑÏû¨ ÏÑ§Ï†ïÏò®ÎèÑÎûë Ïã§ÎÇ¥Ïò®ÎèÑ Ï∞®Ïù¥ ÏïåÎ†§Ï§ò.': {'8B_4bit': 'ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑ(23.00‚ÑÉ)Îäî Ïã§ÎÇ¥Ïò®ÎèÑ(27.00‚ÑÉ)Î≥¥Îã§ 4.00‚ÑÉ ÎÜíÏäµÎãàÎã§.', '8B_8bit': 'ÌòÑÏû¨ Ïö∞Î¶¨Î∞òÏùò ÏÑ§Ï†ïÏò®ÎèÑ(23.00‚ÑÉ)ÏôÄ Ïã§ÎÇ¥Ïò®ÎèÑ(27.00‚ÑÉ)Ïùò Ï∞®Ïù¥Î