In [18]:
# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import sys
import os

# Add the parent directory of src to the path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.db.manager import DBManager
from src.input_to_instructions.load_and_execute import *
from src.input_to_instructions.types import *
from src.operation.execute import *
from src.response_generation.load_and_execute import *
from src.dateutils import normalize_sql_dates


In [None]:
from collections import defaultdict
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

# from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [21]:
BASE_DIR = "../"
def read_json(path):
    """Read the UTF-8 encoded file at `path` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [22]:
# Pick the attention implementation and tensor dtype from the GPU's compute
# capability: major version >= 8 selects flash-attention-2 with bfloat16,
# anything else falls back to eager attention with float16.
# `torch.cuda.get_device_capability()` raises when no CUDA device is visible,
# so availability is checked first and the fallback branch is used in that case.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16
print(f"attn_implementation: {attn_implementation}, torch_dtype: {torch_dtype}")


attn_implementation: flash_attention_2, torch_dtype: torch.bfloat16


In [23]:
# ResponseGeneration.update_prompt()

# Initialise the response-generation component once for the whole notebook.
# NOTE(review): the meaning of `instance_type="unsloth"` is defined in the
# project's ResponseGeneration class, which is not visible here.
ResponseGeneration.initialize(
    log_output=False,
    instance_type="unsloth"
)
# Keep a notebook-level handle on the tokenizer; `measure_token_count` (defined
# in a later cell) reads this global to count tokens.
tokenizer = ResponseGeneration.tokenizer
print(tokenizer)

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.11: Fast Siglip patching. Transformers: 4.53.0.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Siglip does not support SDPA - switching to eager!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

GPT2TokenizerFast(name_or_path='LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct', vocab_size=102400, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[|endofturn|]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("                              ", rstrip=False, lstrip=False, singl

In [24]:
import time
from src.input_to_instructions.types import InstructionQ_raw
def get_time(df, fmt="datetime"):
    """Return the distinct 'timestamp' values of `df` as sorted, formatted strings.

    Args:
        df: Object indexable by 'timestamp'. The entry is either a single
            `pd.Timestamp` (e.g. when `df` is one row) or a Series whose
            elements provide `strftime`.
        fmt: "date" -> '%Y-%m-%d', "month" -> '%Y-%m', "year" -> '%Y';
            any other value formats as '%Y-%m-%d %H:%M:%S'.

    Returns:
        list[str]: The unique formatted timestamps in ascending string order.
    """
    default_pattern = '%Y-%m-%d %H:%M:%S'
    named_patterns = {"date": '%Y-%m-%d', "month": '%Y-%m', "year": '%Y'}
    if isinstance(fmt, str):
        pattern = named_patterns.get(fmt, default_pattern)
    else:
        pattern = default_pattern

    stamps = df['timestamp']
    if isinstance(stamps, pd.Timestamp):
        # A scalar timestamp formats to ONE string. It must be wrapped in a list:
        # passing the bare string to set() would split it into single characters.
        formatted = [stamps.strftime(pattern)]
    else:
        formatted = stamps.apply(lambda ts: ts.strftime(pattern))
    return sorted(set(formatted))

def get_spatials(df):
    """Return the distinct values found in the 'idu_name' column of `df`."""
    idu_names = df['idu_name']
    return pd.unique(idu_names)

def get_tv(df, col:str|list[str], fmt="datetime"):
    """Return `(timestamps, values...)` for one or more columns of `df`.

    The first element is `get_time(df, fmt)`; it is followed by `df[c]` for
    every requested column, in the order given. A single column name may be
    passed as a plain string.
    """
    column_names = [col] if isinstance(col, str) else col
    selected_values = (df[name] for name in column_names)
    return (get_time(df, fmt), *selected_values)

def data_(metadata, mapping, query_results, t=str|list[str], s=str|list[str], m=str|list[str]):
    if isinstance(t, str):
        t = [t]
    if isinstance(s, str):
        s = [s]
    if isinstance(m, str):
        m = [m]

    t_raw = [mapping.temporal[t_highlevel] for t_highlevel in t]
    s_raw = [mapping.spatials[s_highlevel] for s_highlevel in s]
    m_raw = [mapping.modalities[m_highlevel] for m_highlevel in m]
    
    # flatten s_raw into a list of strings
    # flattened = [item for sublist in data for item in (sublist if isinstance(sublist, list) else [sublist])]
    s_raw = [item for sublist in s_raw for item in (sublist if isinstance(sublist, list) else [sublist])]
    # print(s_raw)
    result_df = DBManager.structured_query_data_t_v2(metadata, m_raw, t_raw, s_raw, get_rowids=True)
    
    cols = list(result_df.columns)
    cols.remove("id")
    cols.remove("idu_name")
    cols.remove("timestamp")
    rows = list(result_df["id"])
    query_results.append({
        "result_columns": cols,
        "result_indices": rows,
    })
    # print(cols, rows)

    # For demo, drop rows where any value is -1
    result_df = result_df.loc[(result_df != -1).all(axis=1)]

    # drop "id" from result_df
    result_df = result_df.drop(columns=['id'])

    # change column names to high level
    inverse_mapping = {v: k for k, v in mapping.modalities.items()}
    result_df.columns = [inverse_mapping[col] if col in inverse_mapping else col for col in result_df.columns]

    # change idu_name raw values to high level
    inverse_mapping = {}
    for k, v in mapping.spatials.items():
        if isinstance(v, list):
            for v_ in v:
                inverse_mapping[v_] = k
        else:
            inverse_mapping[v] = k

    result_df["idu_name"] = result_df["idu_name"].map(inverse_mapping)

    return result_df


def run_query_v2(user_input, metadata, mapping, expectations, required_variables, scripts, exp_tag=None):
    """Run the generated scripts for one user input and build the final response.

    When `scripts` is not None:
      * `metadata`, `mapping` and a fresh `query_results` list are published as
        module globals, and every existing global whose name starts with "v_"
        is deleted.
      * Each script is executed in order. A script containing "SELECT" is
        treated as `<variable> = "<sql>"`: the SQL is run through
        `DBManager.execute_structured_query_string` and the frame is stored in
        the global `<variable>`. Any other script is run with `exec` in the
        module globals (after rewriting `data(` calls to `data_(...)`).
      * All globals starting with "v_" are collected and handed to
        `ResponseGeneration.execute_v2`.

    When `scripts` is None, either `ResponseGeneration.execute_v2` is called
    with no variables (for `exp_tag` "woQM" / "woQM+Script"), or a message
    listing the mapping entries whose value is "Unknown" is returned.

    Returns:
        tuple: `(response, variables, required_variables, query_results)`.
        If anything raises while running the scripts or generating the
        response, a fixed error message is returned as `response` and
        `required_variables` is None.
    """
    query_results = []
    variables = {}
    # print(f"exp_tag: {exp_tag}")
    if scripts is not None:

        # The scripts call data(t=..., s=..., m=...); the rewritten call below
        # resolves these three names from the module globals.
        globals()['metadata'] = metadata
        globals()['mapping'] = mapping
        globals()['query_results'] = query_results
        # Remove "v_*" globals left over from a previous call so they do not
        # leak into this call's `variables`.
        for name in list(globals()):
            if name.startswith("v_"):
                del globals()[name]
        try:
            # NOTE: these two accumulators are only consumed by the
            # commented-out timing print further down.
            query_time = 0
            process_time = 0

            for script in scripts:
                try:
                    start_time = time.time()
                    if "data" in script:
                        script = script.replace("data(", "data_(metadata, mapping, query_results, ")

                    if "SELECT" in script:
                        # split only at the first '=' to avoid issues with '=' in SQL
                        variable, sql = script.split("=", 1)
                        variable = variable.strip()
                        sql = sql.strip()
                        # Take everything between the first and the last double quote.
                        # NOTE(review): `re` is not imported in any visible cell; presumably
                        # it arrives through one of the star imports — confirm.
                        sql = re.findall(r'"(.*)"', sql)
                        sql = sql[0]
                        # Replace only the first occurrence of "SELECT" with "SELECT id, ".
                        sql = sql.replace("SELECT", "SELECT id, ", 1)
                        df = DBManager.execute_structured_query_string(sql)
                        cols = list(df.columns)
                        cols.remove("id")
                        cols.remove("idu_name")
                        cols.remove("timestamp")
                        rows = list(df["id"])
                        query_results.append({
                            "result_columns": cols,
                            "result_indices": rows,
                        })
                        df = df.drop(columns=['id'])
                        globals()[variable] = df
                    else:
                        # NOTE(security): executes generated script text with full access to
                        # the notebook globals; only run on trusted script sources.
                        exec(script, globals())

                    end_time = time.time()
                    if "data" in script:
                        query_time += end_time - start_time
                    else:
                        process_time += end_time - start_time
                except Exception as e:
                    print(f"Error in executing script: {script}")
                    print(e)
                    raise e

            start_time = time.time()
            # Everything the scripts stored under a "v_" name becomes a response variable.
            variables = {name:globals()[name] for name in globals() if name.startswith("v_")}
            response, required_variables = ResponseGeneration.execute_v2(expectations, required_variables, variables, user_input, exp_tag=exp_tag)
            rg_last_input_token_length = measure_token_count(ResponseGeneration.last_input_str)
            rg_last_output_token_length = measure_token_count(response)
            print("rg_last_input_token_length,", rg_last_input_token_length, ",rg_last_output_token_length,", rg_last_output_token_length)

            response_generation_time = time.time() - start_time

            # print(f"Question: {user_input}, query time: {query_time:.4f}s, processing time: {process_time:.4f}s, response generation time: {response_generation_time:.4f}s")
            return response, variables, required_variables, query_results
        except Exception as e:
            # Best-effort: report the failure and return a fixed error response
            # together with whatever was collected so far.
            print(f"Error in running query_v2: {e}")
            return "실행중 에러가 발생했습니다.", variables, None, query_results
    else:
        if exp_tag in ["woQM", "woQM+Script"]:
            response, required_variables = ResponseGeneration.execute_v2(expectations, required_variables, variables, user_input, exp_tag=exp_tag)
            return response, variables, required_variables, query_results
        else:
            variables = {}
            # Collect the high-level names that the mapping marked as "Unknown".
            unknown_spatials = [k for k, v in mapping.spatials.items() if v == "Unknown"]
            unknown_modalities = [k for k, v in mapping.modalities.items() if v == "Unknown"]

            response_unknown = f"죄송합니다, {unknown_spatials + unknown_modalities}는 존재하지 않는 공간이나 모달리티 입니다."
            return response_unknown, variables, [], query_results


def run_query(user_input, metadata, instructions, exp_tag=None):
    """Execute a list of instructions in order and return the generated response.

    Instruction handling (matched on the exact type of each instruction):
      * InstructionQ: runs `DBManager.structured_query_data_t`, records the
        touched columns/row ids, drops rows containing -1 and the "id" column,
        and stores the frame under `instruction.result_name`.
      * InstructionQ_raw: runs the instruction's SQL (with `"id"` added to the
        selected columns) through `DBManager.execute_structured_query_string`,
        renames "idu_name" to "idu", records columns/row ids, drops "id" and
        stores the frame under `instruction.result_name`.
      * InstructionO: runs `OperationExecutor.execute` and merges the returned
        dict into the variables.
      * InstructionR: calls `ResponseGeneration.execute` and returns.

    Args:
        user_input: The user's question, forwarded to response generation.
        metadata: Mapping forwarded to the DB manager; only its
            "modality_mapping" and "idu_mapping" keys are forwarded to
            response generation.
        instructions: Iterable of instruction objects.
        exp_tag: Forwarded to `ResponseGeneration.execute`.

    Returns:
        tuple: `(response, variables_to_report, required_variables, query_results)`
        as soon as an InstructionR is reached. If `instructions` contains no
        InstructionR the function falls off the end and returns None.
    """
    variables = {
        "Metadata": metadata,
    }
    query_results = []

    def _record_query(result_df):
        """Append the value columns and row ids of `result_df` to `query_results`."""
        cols = list(result_df.columns)
        cols.remove("id")
        cols.remove("idu")
        query_results.append({
            "result_columns": cols,
            "result_indices": list(result_df["id"]),
        })

    for instruction in instructions:
        kind = type(instruction)

        if kind is InstructionQ:
            result_df = DBManager.structured_query_data_t(metadata, instruction.args, get_rowids=True)
            _record_query(result_df)

            # For demo, drop rows where any value is -1
            result_df = result_df.loc[(result_df != -1).all(axis=1)]

            # The row id is only needed for the bookkeeping above.
            variables[instruction.result_name] = result_df.drop(columns=['id'])

        elif kind is InstructionQ_raw:
            # Build the id-augmented SQL in a local variable. Writing it back to
            # `instruction.query` would add another `, "id"` every time the same
            # instruction object is executed again.
            query = instruction.query.replace(" FROM \"data_t\"", ", \"id\" FROM \"data_t\"")
            result_df = DBManager.execute_structured_query_string(query)
            # Align the column name with the InstructionQ branch.
            result_df = result_df.rename(columns={'idu_name': 'idu'})
            _record_query(result_df)

            variables[instruction.result_name] = result_df.drop(columns=['id'])

        elif kind is InstructionO:
            result_dict = OperationExecutor.execute(variables, instruction.scripts)
            variables.update(result_dict)

        elif kind is InstructionR:
            # Snapshot of everything except the metadata, taken before response generation.
            variables_to_report = {k: v for k, v in variables.items() if k not in ["Metadata"]}

            # Only these metadata entries are forwarded to response generation.
            keys_to_leave = ["modality_mapping", "idu_mapping"]
            metadata_ = {key: metadata[key] for key in metadata.keys() if key in keys_to_leave}

            response, required_variables = ResponseGeneration.execute(instruction, variables, user_input, metadata_, exp_tag=exp_tag)

            return response, variables_to_report, required_variables, query_results

In [25]:
from copy import deepcopy
# Dataset version to evaluate. build_query_groundtruth() also checks this name
# for the "v7" marker to decide whether per-scenario metadata.json is loaded.
dataset_name = "v7-250309-reduceinputanddatefunctioncall"
# Dataset root; its sub-directories are iterated as individual scenarios.
base_dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dataset_name}")

def build_query_groundtruth():
    """Run the expert (ground-truth) responses through the query pipeline and dump the results.

    For every scenario directory under `base_dataset_dir`, the samples of
    `onlyq_ts.json` and `onlyq_tr.json` are loaded and annotated with the
    scenario name (and, for "v7" datasets, the scenario's metadata.json).
    Samples whose style tags contain one of the skip tags are ignored. Each
    remaining sample's `Response` is post-processed and executed with
    `run_query_v2`, and the collected records are written to `./gts.json`.
    """
    is_v7 = "v7" in dataset_name

    def load_split(path, scenario_dir, scenario_metadata):
        """Read one split file and tag each sample with its scenario (and metadata for v7)."""
        samples = read_json(path)
        for sample in samples:
            sample["Scenario"] = scenario_dir.name
            if is_v7:
                sample["Metadata"] = scenario_metadata
        return samples

    ds = []
    for directory in base_dataset_dir.iterdir():
        if not directory.is_dir():
            continue
        scenario_metadata = read_json(f"{directory}/metadata.json") if is_v7 else None
        for split_file in ("onlyq_ts.json", "onlyq_tr.json"):
            ds.extend(load_split(f"{directory}/{split_file}", directory, scenario_metadata))

    print(len(ds))

    skip_tags = ["Reason", "Graph", "Unrelated", "Prediction"]
    exp_tag = "v2"
    gts = []

    for d in ds:
        style_tags = d["Tags"]["Style"]
        if any(st in style_tags for st in skip_tags):
            continue

        # Token statistics of the expert response / input, logged as CSV-like text.
        expertLLM_output_token_length = measure_token_count(d['Response'])
        expertLLM_input_token_length = measure_token_count(d['Input'])
        print("Input,", d['Input'], ",expertLLM_output_tlen,",  expertLLM_output_token_length, ",expertLLM_input_tlen,", expertLLM_input_token_length)

        # Post-process a copy so the stored ground truth stays untouched.
        mapping, expectations, required_variables, scripts = InputToInstruction.postprocess_v2(deepcopy(d['Response']), exp_tag=exp_tag)
        user_input, tags, metadata, scenario = d["Input"], d["Tags"], d["Metadata"], d["Scenario"]

        response, variables_to_report, required_variables, query_results = run_query_v2(
            user_input, metadata, mapping, expectations, required_variables, scripts, exp_tag=exp_tag
        )
        print(f"출력: {response}")

        record = {
            "Input": user_input,
            "Metadata": metadata,
            "Scenario": scenario,
            "Tags": tags,
            "GT": d['Response'],
            "Response": response,
            "QueryResults": query_results,
        }
        gts.append(record)

    # Persist all collected records as one JSON document.
    with open("./gts.json", "w", encoding="utf-8") as f:
        json.dump(gts, f, ensure_ascii=False, indent=4)
  

In [26]:
def measure_token_count(input: str) -> int:
    """Return how many token ids the notebook-level `tokenizer` produces for `str(input)`."""
    token_ids = tokenizer.encode(str(input))
    return len(token_ids)

# # ResponseGeneration.update_prompt()
build_query_groundtruth()

52
Input, 어제 우리반과 옆반의 설정온도 차이 알려줘 ,expertLLM_output_tlen, 360 ,expertLLM_input_tlen, 11
rg_last_input_token_length, 155 ,rg_last_output_token_length, 39
출력: 어제 우리반의 설정온도(23.00°C)는 옆반의 설정온도(23.00°C)보다 0.00°C 높습니다.
Input, 오늘 우리반과 옆반의 평균 온도차이 알려줘 ,expertLLM_output_tlen, 403 ,expertLLM_input_tlen, 11
rg_last_input_token_length, 193 ,rg_last_output_token_length, 34
출력: 우리반(27.45°C)이 옆반(26.65°C)보다 0.80°C 높습니다.
Input, 작년 겨울 우리반 평균온도 알려줘 ,expertLLM_output_tlen, 268 ,expertLLM_input_tlen, 8
rg_last_input_token_length, 76 ,rg_last_output_token_length, 32
출력: 작년 겨울(2021-12 ~ 2022-02) 우리반의 평균 온도 데이터를 찾을 수 없습니다.
Input, 올해 여름 앞반 평균온도 알려줘 ,expertLLM_output_tlen, 242 ,expertLLM_input_tlen, 8
rg_last_input_token_length, 90 ,rg_last_output_token_length, 25
출력: 올해 여름(6월 ~ 8월) 앞반의 평균 온도는 26.11°C 입니다.
Input, 올해 봄 옆반 제일 추웠던 날 알려줘 ,expertLLM_output_tlen, 370 ,expertLLM_input_tlen, 10
rg_last_input_token_length, 104 ,rg_last_output_token_length, 15
출력: 올해 봄 옆반 실내 최저 온도 데이터를 찾을 수 없습니다.
Input, 4월 앞반 평균온도 알려줘 ,e

# Run inference


In [28]:
import time

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

class UnslothInference:
    """Batched, deterministic inference with a model loaded through `FastLanguageModel`.

    Typical use: construct, then call `run(dataset, common_prompt, output_file)`.
    `run` loads the model lazily, processes `dataset` in slices of
    `batch_size`, and appends one JSON line per successfully parsed response
    to `output_file`.
    """

    # (marker, regex) pairs used by `extract_content`. The first marker found in
    # the decoded text decides which chat-template pattern is applied.
    _RESPONSE_PATTERNS = (
        ("start_header_id", r"<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>"),
        ("start_of_turn", r"<start_of_turn>model\n(.*?)<eos>"),
        ("im_start", r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>"),
        ("|endofturn|", r"\[\|assistant\|\](.*?)\[\|endofturn\|\]"),
    )

    def __init__(
        self,
        checkpoint_dir: str,
        cache_dir: str,
        max_seq_length: int = 3500,
        attn_implementation: str = attn_implementation,
        batch_size: int = 8  # number of samples generated together
    ):
        """Store the inference configuration; the model itself is loaded in `setup_model`.

        Args:
            checkpoint_dir: Model location. If the string contains 'checkpoint'
                it is treated as a local directory that must exist; otherwise
                it is kept as given.
            cache_dir: Cache directory forwarded to the model loader.
            max_seq_length: Used as `max_new_tokens` during generation.
            attn_implementation: Attention implementation forwarded to the
                loader; defaults to the notebook-level `attn_implementation`.
            batch_size: Number of samples per generation batch.

        Raises:
            ValueError: If a 'checkpoint' directory is given but does not exist.
        """
        if 'checkpoint' in checkpoint_dir:
            self.checkpoint_dir = Path(checkpoint_dir)
            if not self.checkpoint_dir.exists():
                raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")
        else:
            self.checkpoint_dir = checkpoint_dir
        self.cache_dir = Path(cache_dir)
        self.max_seq_length = max_seq_length
        self.attn_implementation = attn_implementation
        self.batch_size = batch_size

        # dtype chosen once at notebook level from the GPU capability
        self.torch_dtype = torch_dtype

    def setup_model(self):
        """Load the model and tokenizer once; later calls are no-ops.

        Any loading error is printed and re-raised.
        """
        if hasattr(self, "model"):
            return

        try:
            if isinstance(self.checkpoint_dir, Path):
                checkpoint_dir = self.checkpoint_dir.as_posix()
            else:
                checkpoint_dir = self.checkpoint_dir

            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                checkpoint_dir,
                dtype = self.torch_dtype,
                load_in_4bit = False,
                load_in_8bit = False,
                attn_implementation=self.attn_implementation,
                cache_dir=self.cache_dir.as_posix(),
                local_files_only=True,
                device_map="cuda",
            )
            FastLanguageModel.for_inference(self.model)

            # Batches are left-padded in process_batch; keep the tokenizer consistent.
            self.tokenizer.padding_side = "left"
            print(f"Model loaded from {self.checkpoint_dir}")
        except Exception as e:
            print(f"Error in setup_model {str(e)}")
            raise

    @staticmethod
    def extract_content(text: str):
        """Return the assistant's reply extracted from a decoded generation.

        The chat format is detected from marker substrings in `text`. Returns
        the stripped reply, or None when no known marker is present or the
        corresponding pattern does not match.
        """
        for marker, pattern in UnslothInference._RESPONSE_PATTERNS:
            if marker in text:
                match = re.search(pattern, text, re.DOTALL)
                return match.group(1).strip() if match else None
        # Unknown chat format: previously this fell through to an unbound
        # `pattern` and raised; report "nothing extracted" instead.
        return None

    def process_batch(
        self,
        batch_data,
        common_prompt,
    ):
        """Generate and parse responses for one batch of samples.

        Args:
            batch_data: List of sample dicts providing "Metadata" and "Input".
            common_prompt: Prompt text shared by all samples.

        Returns:
            list: One entry per sample — the extracted reply text, or None when
            parsing failed. If anything raises, the error is printed and a
            list of None of the batch's length is returned.
        """
        # Captured up front so the error path returns one placeholder per input sample.
        expected = len(batch_data)
        try:
            batch = Dataset.from_list(batch_data)
            model = self.model
            tokenizer = self.tokenizer
            architecture = model.config.architectures[0].lower()

            convos = []
            for metadata, user_input in zip(batch["Metadata"], batch["Input"]):
                if "gemma" in architecture:
                    # Everything is packed into a single user turn for this architecture.
                    chat = [
                        {"role": "user", "content": f"{common_prompt};{json.dumps(metadata)};{user_input}"},
                    ]
                else:
                    chat = [
                        {"role": "system", "content": common_prompt},
                        {"role": "user", "content": f"Metadata:{metadata};Input:{user_input};"},
                    ]

                token_ids = tokenizer.apply_chat_template(
                    chat,
                    tokenize=True,
                    add_generation_prompt=True,
                    return_tensors="pt"
                ).to(model.device)
                convos.append(token_ids)

            max_length = max(inputs.size(1) for inputs in convos)

            # Left-pad every prompt to the longest one and build the matching
            # attention mask (1 = real token, 0 = padding).
            padded_inputs = []
            attention_masks = []
            for inputs in convos:
                pad_length = max_length - inputs.size(1)
                padded_inputs.append(torch.cat([
                    torch.full((1, pad_length), tokenizer.pad_token_id, dtype=inputs.dtype, device=model.device),
                    inputs,
                ], dim=1))
                attention_masks.append(torch.cat([
                    torch.zeros(1, pad_length, dtype=torch.long, device=model.device),
                    torch.ones(1, inputs.size(1), dtype=torch.long, device=model.device),
                ], dim=1))

            batch_tensor = torch.cat(padded_inputs, dim=0)
            attention_mask = torch.cat(attention_masks, dim=0)

            outputs = model.generate(
                input_ids=batch_tensor,
                attention_mask=attention_mask,
                max_new_tokens=self.max_seq_length,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False  # deterministic generation
            )

            # Special tokens are kept because extract_content relies on them.
            responses = tokenizer.batch_decode(outputs, skip_special_tokens=False)
            parsed_responses = []
            for response in responses:
                parsed = self.extract_content(response)
                if parsed is None:
                    print(f"Error parsing response: {response[:100]}...")
                parsed_responses.append(parsed)
            return parsed_responses

        except Exception as e:
            print(f"Error in process_batch: {str(e)}")
            return [None] * expected

    def run(
        self,
        dataset,
        common_prompt: str,
        output_file: str
    ):
        """Run inference over `dataset` in batches and append results to `output_file`.

        Args:
            dataset: Sliceable sequence of sample dicts with "Input",
                "Scenario" and "Metadata" keys.
            common_prompt: Prompt text shared by all samples.
            output_file: Path of a JSON-lines file; one line is appended per
                sample whose response could be extracted.
        """
        self.setup_model()

        with tqdm(total=len(dataset)) as pbar:
            for batch_start in range(0, len(dataset), self.batch_size):
                batch_end = min(batch_start + self.batch_size, len(dataset))
                batch_data = dataset[batch_start:batch_end]

                start_time = time.time()
                responses = self.process_batch(
                    batch_data, common_prompt
                )
                time_taken = time.time() - start_time
                print(f"Time taken: {time_taken:.2f}s, Input: {batch_data[0]['Input']}")

                for i, response in enumerate(responses):
                    sample = batch_data[i]

                    if response is None:
                        print(f"Error in response for sample {batch_start + i}")
                        continue

                    try:
                        # NOTE(security): eval() executes model-generated text. Consider
                        # json.loads / ast.literal_eval if the output format allows it.
                        response = eval(response)
                    except Exception as e:
                        # Best-effort: keep the raw text as the candidate.
                        print(f"Error in eval: {str(e)}")

                    result = {
                        "Input": sample["Input"],
                        "Scenario": sample["Scenario"],
                        "Metadata": sample["Metadata"],
                        "Candidate": response,
                    }

                    with open(output_file, "a", encoding="utf-8") as f:
                        f.write(json.dumps(result, ensure_ascii=False) + "\n")

                pbar.update(batch_end - batch_start)
                

In [24]:


def read_dataset(dir, path, train_type):
    """Load one split of a scenario and adapt each response to `train_type`.

    Args:
        dir: Scenario directory (a `Path`); must contain "metadata.json".
        path: File name of the split inside `dir` (a JSON list of samples).
        train_type: Ablation name controlling how each sample's "Response" is
            rewritten:
              * "WoThinking", "WoMetadata+Thinking": remove "Thinking".
              * "woExp": remove "Expectations".
              * "woScript", "woQM+Script": keep only scripts containing "data".
              * "woQM": in scripts without "data", replace the quoted
                high-level modality names with the raw column names.
              * "woQM", "woQM+Script": replace each `data(...)` call with the
                SQL string produced by `DBManager.get_query_strings_v2`
                and drop "Mapping".

    Returns:
        list[dict]: One dict per kept sample with keys "Metadata", "Input",
        "Scenarios" (the directory name) and "Response" (JSON string).
        Samples whose style tags include "Reason", "Graph", "Unrelated" or
        "Prediction" are left out.
    """
    # Both files are read as UTF-8 and closed deterministically.
    with open(dir / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)

    with open(dir / path, "r", encoding="utf-8") as f:
        data = json.load(f)

    skip_tags = ["Reason", "Graph", "Unrelated", "Prediction"]
    result = []
    for d in data:
        # Decide first whether the sample is kept, so skipped samples are not
        # rewritten (and no SQL is generated for them) needlessly.
        tags = d["Tags"]["Style"]
        if any(skip_tag in tags for skip_tag in skip_tags):
            continue

        response = d["Response"]
        if train_type in ["WoThinking", "WoMetadata+Thinking"]:
            del response["Thinking"]
        elif train_type in ["woExp"]:
            del response["Expectations"]

        if "Script" in response:
            if train_type in ["woScript", "woQM+Script"]:
                response["Script"] = [script for script in response["Script"] if "data" in script]

            if train_type in ["woQM"]:
                renamed_scripts = []
                for script in response["Script"]:
                    if "data" not in script:
                        for high_level, raw_column in [("실내온도", "roomtemp"), ("설정온도", "settemp")]:
                            script = script.replace(f"'{high_level}'", f"'{raw_column}'")
                    renamed_scripts.append(script)
                response["Script"] = renamed_scripts

            if train_type in ["woQM", "woQM+Script"]:
                mapping = response["Mapping"]
                for i, script in enumerate(response["Script"]):
                    if "data" not in script:
                        continue

                    # Each argument is either a quoted string or a bracketed list.
                    t_match = re.search(r"t=('[^']+'|\[[^\]]+\])", script)
                    s_match = re.search(r"s=('[^']+'|\[[^\]]+\])", script)
                    m_match = re.search(r"m=('[^']+'|\[[^\]]+\])", script)
                    # NOTE(security): eval() runs text taken from the dataset file;
                    # only use on trusted datasets (ast.literal_eval would suffice
                    # if the arguments are always plain literals).
                    t = eval(t_match.group(1)) if t_match else None
                    s = eval(s_match.group(1)) if s_match else None
                    m = eval(m_match.group(1)) if m_match else None

                    if isinstance(t, str):
                        t = [t]
                    if isinstance(s, str):
                        s = [s]
                    if isinstance(m, str):
                        m = [m]

                    t_raw = [mapping['temporal'][t_highlevel] for t_highlevel in t]
                    s_raw = [mapping['spatials'][s_highlevel] for s_highlevel in s]
                    m_raw = [mapping['modalities'][m_highlevel] for m_highlevel in m]
                    # A spatial name may map to a list of raw names; flatten.
                    s_raw = [item for sublist in s_raw for item in (sublist if isinstance(sublist, list) else [sublist])]
                    sql = DBManager.get_query_strings_v2(
                        metadata, m_raw, t_raw, s_raw
                    )
                    sql = normalize_sql_dates(sql)
                    # Replace data(...) with the quoted SQL. A callable replacement is
                    # used so the SQL text is inserted verbatim (no escape processing).
                    response["Script"][i] = re.sub(r"data\(([^)]+)\)", lambda x: f"\"{sql}\"", script)
                del response["Mapping"]

        result.append({"Metadata": metadata, "Input": d["Input"], "Scenarios": dir.name, "Response": json.dumps(d["Response"], ensure_ascii=False)})
    return result

def sub(name, common_prompt):
    """Strip every ``<|name|> ... <|name|>`` section from a prompt.

    The opening marker, the closing marker, everything between them
    (including newlines) and one optional newline directly before the
    opening marker are removed.

    Args:
        name: Literal tag name that appears between ``<|`` and ``|>``.
        common_prompt: Prompt text to clean.

    Returns:
        The prompt with all matching sections removed; unchanged if the
        tag does not occur.
    """
    # Escape the tag so names containing regex metacharacters (e.g. "+")
    # match the literal marker text instead of being read as regex syntax.
    tag = re.escape(name)
    # [\s\S] already spans newlines, so no DOTALL flag is required.
    pattern = rf"\n?<\|{tag}\|>[\s\S]*?<\|{tag}\|>"
    return re.sub(pattern, "", common_prompt)

def run_inference(checkpoint_number, train_type):
    """Run model inference over every scenario dataset and save the results.

    The function
      1. loads the ``onlyq_ts.json`` and ``onlyq_tr.json`` examples of every
         scenario directory under ``base_dataset_dir``,
      2. builds the system prompt from ``prompt.txt``, dropping the tagged
         sections that the given ``train_type`` is not supposed to see,
      3. optionally appends few-shot examples (``0SL`` / ``5SL`` / ``ALLSL``),
      4. runs ``UnslothInference`` and rewrites its line-per-record output
         file into a single JSON array.

    Args:
        checkpoint_number: ``0`` selects the untrained base model; any other
            value selects ``checkpoint-<checkpoint_number>`` of the run that
            belongs to ``train_type``.
        train_type: Experiment tag; selects which prompt sections are removed
            and which checkpoint directory / output file name is used.

    Raises:
        ValueError: If the requested checkpoint directory does not exist.
    """
    # --- dataset -----------------------------------------------------------
    dataset = []
    scenario_dirs = [
        d for d in base_dataset_dir.iterdir()
        if d.is_dir() and "scenario" in d.name and "metadata.json" in [f.name for f in d.iterdir()]
    ]
    for scenario_dir in scenario_dirs:
        # Same order as before: test split first, then train split.
        for split_file in ("onlyq_ts.json", "onlyq_tr.json"):
            examples = read_dataset(scenario_dir, split_file, train_type)
            for example in examples:
                example["Scenario"] = scenario_dir.name
            dataset.extend(examples)

    # --- prompt ------------------------------------------------------------
    # The prompt contains Korean text; read it as UTF-8 explicitly and make
    # sure the file handle is closed.
    with open(base_dataset_dir / "prompt.txt", "r", encoding="utf-8") as f:
        common_prompt = f.read()

    sub_targets = []
    if train_type == "ours":
        sub_targets = []
    elif train_type == "BASE":
        sub_targets = ["Thinking", "Expectation", "Mapping", "Script", "Examples"]
    elif train_type in ["WoThinking"]:
        sub_targets = ["Thinking"]
    elif train_type in ["woMetadata", "WoMetadata"]:
        # Both spellings are accepted: the config cell uses "WoMetadata".
        sub_targets = ["Metadata"]
    elif train_type in ["WoMetadata+Thinking"]:
        sub_targets = ["Metadata", "Thinking"]
    elif train_type in ["woExp"]:
        sub_targets = ["Expectation"]

    # These two conditions overlap for "woQM+Script", so accumulate the
    # targets instead of assigning (assignment dropped "QM"/"Mapping").
    if train_type in ["woQM", "woQM+Script"]:
        sub_targets.extend(["QM", "Mapping"])
    if train_type in ["woScript", "woQM+Script"]:
        sub_targets.append("Script")

    for sub_target in sub_targets:
        common_prompt = sub(sub_target, common_prompt)

    # Remove every remaining <|...|> marker (the section bodies are kept).
    common_prompt = re.sub(r"<\|.*?\|>", "", common_prompt)

    if train_type in ["0SL", "5SL", "ALLSL"]:
        # n-shot prompting.
        # NOTE(review): examples are drawn from both the train and the test
        # split - confirm that including test items in the prompt is intended.
        datas = []
        for directory in base_dataset_dir.iterdir():
            if directory.is_dir():
                # Note: metadata is not included in the few-shot examples.
                for split_file in ("onlyq_tr.json", "onlyq_ts.json"):
                    data = read_json(f"{directory}/{split_file}")
                    for d in data:
                        del d["Tags"]
                    datas.extend(data)
        if "ALL" not in train_type:
            n = int(train_type.split("SL")[0])
            datas = datas[:n]

        if train_type != "0SL":
            data_str = "\n[예시]\n" + "\n".join([f"입력: {d['Input']}\n출력: {d['Response']}" for d in datas])
            common_prompt = common_prompt + data_str
    print(common_prompt)

    # --- model / checkpoint ------------------------------------------------
    model_name = "sh2orc-Llama-3.1-Korean-8B-Instruct"
    model_dir = Path(f"/model/{model_name}")
    cache_dir = Path(f"{model_dir}/cache")

    if checkpoint_number == 0:
        # Untrained baseline: load the base model by its hub id.
        checkpoint_dir = "sh2orc/Llama-3.1-Korean-8B-Instruct"
        output_file = f"../experiments/r-v7_not_trained_{train_type}_tr27_0629.json"
        max_seq_length = 3000
    else:
        r = 211
        tr_dir = f"v7_r{r}_a{2*r}_{train_type}_tr27_0629"
        print(Path(f"{model_dir}/chkpts/{tr_dir}"))

        # The previous "latest checkpoint" lookup was discarded immediately
        # (and crashed on non "checkpoint-<n>" entries), so it is omitted;
        # the explicitly requested checkpoint is always used.
        tr_config = f"{tr_dir}/checkpoint-{checkpoint_number}"
        print(tr_config)
        checkpoint_dir = Path(f"{model_dir}/chkpts/{tr_config}")

        print(f"Model: {model_name}, Config: {tr_config}")

        # Verify paths exist
        if not checkpoint_dir.exists():
            raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")

        output_file = f"../experiments/r-{tr_config.replace('/', '-')}.json"
        max_seq_length = 10000

    batch_size = 1
    inference = UnslothInference(
        checkpoint_dir=str(checkpoint_dir),
        cache_dir=str(cache_dir),
        batch_size=batch_size,
        max_seq_length=max_seq_length
    )

    # Make sure the output location exists, then clear any previous output.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    open(output_file, "w").close()

    inference.run(
        dataset=dataset,
        common_prompt=common_prompt,
        output_file=output_file
    )

    # Turn the line-per-record output into one JSON array.
    # Assumes inference.run writes one JSON value per line - TODO confirm.
    # Blank lines are skipped so they cannot produce empty array elements.
    # UTF-8 matches how read_json() later reads this file.
    with open(output_file, "r", encoding="utf-8") as f:
        records = [line.strip() for line in f if line.strip()]
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("[\n")
        f.write(",\n".join(records))
        f.write("\n]\n")

    print(f"Saved to {output_file}")

    del inference

In [29]:
# # ResponseGeneration.update_prompt()
# Run inference for multiple models: each entry is a
# (checkpoint_number, train_type) pair that is passed to run_inference().
# Commented-out entries are other train_type variants that are currently disabled.
model_configs = [
    (25, "ours"),
    # (55, "woExp"),
    # (55, "WoMetadata"),
    # (55, "WoMetadata+Thinking"),
    # (55, "WoThinking"),
    # (55, "woQM"),
    # (55, "woQM+Script"),
    # (55, "woScript")
]

# Configurations run one after another; each call writes its own output file.
for checkpoint_num, config_name in model_configs:
    print(f"Running inference for {config_name} with checkpoint {checkpoint_num}")
    run_inference(checkpoint_num, config_name)


Running inference for ours with checkpoint 25
너는 유저의 HVAC 관련 질문에 답변하는 Agent의 계획을 설계하는 정확하고 훌룡한 인공지능이다. 
사용자의 질문(Input)을 받아 Agent의 Instructions를 출력해 주어야 한다.
Thinking에서는 HVAC 상식을 바탕으로 유저의 의도를 추측해 빠진 context를 추측하여 질문의 모호함을 없앤 완벽한 형태의 질문을 출력하고, 이에 대한 답변 계획을 세워야함.
Expectation에서는 유저가 기대할만한 답변을 추측해야함.
Mapping에서는 질문에 사용된 high-level taxonomy를 metadata를 바탕으로 low-level taxonomy로의 mapping을 계산하여야 함.
이때 metadata에 없는 정보를 기술하는 등 거짓된 출력을 하면 안되고, 모르는 정보가 있으면 Unknown이라 답변해야함.
Script에서는 data 함수를 이용하여 pandas dataframe 형식의 데이터를 쿼리하여 답변에 필요한 연산을 python 스크립트를 짜 수행한다. 이때 실행 에러에 조심한다. python
Expectation의 명시된 모든 variable이 script에서 계산되야 한다.
json형식으로 출력하며, eval() 함수를 사용할 수 있도록 괄호들과 따옴표들의 순서와 닫힘을 매우 신경써서 출력해야한다.
/model/sh2orc-Llama-3.1-Korean-8B-Instruct/chkpts/v7_r211_a422_ours_tr27_0629
v7_r211_a422_ours_tr27_0629/checkpoint-25
Model: sh2orc-Llama-3.1-Korean-8B-Instruct, Config: v7_r211_a422_ours_tr27_0629/checkpoint-25
==((====))==  Unsloth 2025.6.11: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    NVID

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded from /model/sh2orc-Llama-3.1-Korean-8B-Instruct/chkpts/v7_r211_a422_ours_tr27_0629/checkpoint-25


  0%|          | 0/52 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 1/52 [00:09<07:51,  9.24s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 9.24s, Input: 어제 우리반과 옆반의 설정온도 차이 알려줘


  4%|▍         | 2/52 [00:19<07:58,  9.57s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 9.80s, Input: 오늘 우리반과 옆반의 평균 온도차이 알려줘


  6%|▌         | 3/52 [00:24<06:18,  7.72s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.52s, Input: 작년 겨울 우리반 평균온도 알려줘


  8%|▊         | 4/52 [00:30<05:38,  7.05s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 6.03s, Input: 올해 여름 앞반 평균온도 알려줘


 10%|▉         | 5/52 [00:40<06:19,  8.07s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 9.86s, Input: 올해 봄 옆반 제일 추웠던 날 알려줘


 12%|█▏        | 6/52 [00:45<05:28,  7.15s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.36s, Input: 4월 앞반 평균온도 알려줘


 13%|█▎        | 7/52 [00:53<05:34,  7.43s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 8.01s, Input: 이번달 중 우리반 온도가 가장 덜 더운날이 언제야?


 15%|█▌        | 8/52 [01:04<06:06,  8.34s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 10.28s, Input: 2주전 우리반과 옆반 합쳐서 설정온도가 가장 낮은날이 언제야?


 17%|█▋        | 9/52 [01:08<05:06,  7.13s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 4.48s, Input: 이번달 중 뒷반 온도가 가장 더운날이 언제야?


 19%|█▉        | 10/52 [01:12<04:22,  6.26s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 4.30s, Input: 우리반의 현재 설정 온도 알려줘


 21%|██        | 11/52 [01:18<04:09,  6.08s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.67s, Input: 8일전 설정온도는?


 23%|██▎       | 12/52 [01:24<04:01,  6.03s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.93s, Input: 10년 전 오늘 우리반 온도는?


 25%|██▌       | 13/52 [01:27<03:17,  5.06s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.81s, Input: 롯데캐슬의 현재 온도 알려줘


 27%|██▋       | 14/52 [01:32<03:09,  4.98s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 4.81s, Input: 지금 4층 최대 설정온도 알려줘


 29%|██▉       | 15/52 [01:40<03:38,  5.92s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 8.08s, Input: 우리반, 옆반, 앞반 중 가장 추운 방은?


 31%|███       | 16/52 [01:43<02:59,  4.99s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.84s, Input: 지금 에너지 사용량 알려줘


 33%|███▎      | 17/52 [01:49<03:06,  5.32s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 6.07s, Input: 지난달 오늘 오후 2시에 옆반의 설정온도는 어땠어?


 35%|███▍      | 18/52 [01:54<03:06,  5.48s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.85s, Input: 옆반의 현재 온도랑 설정온도 알려줘


 37%|███▋      | 19/52 [02:00<02:59,  5.45s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.38s, Input: 이번주 우리반 평균 온도 알려줘


 38%|███▊      | 20/52 [02:05<02:53,  5.41s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.32s, Input: 지난달 설정온도 평균을 알려줘.


 40%|████      | 21/52 [02:08<02:22,  4.59s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.69s, Input: 1층 평균 실내온도 알려줘


 42%|████▏     | 22/52 [02:17<02:58,  5.95s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 9.10s, Input: 우리반 이번달 제일 추웠던 날은 언제냐?


 44%|████▍     | 23/52 [02:26<03:18,  6.84s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 8.94s, Input: 작년 옆반 가장 더웠던 달은?


 46%|████▌     | 24/52 [02:32<03:01,  6.48s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.62s, Input: 오늘 오전 11시에 옆반의 실내온도는 어땠어?


 48%|████▊     | 25/52 [02:38<02:56,  6.55s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 6.72s, Input: 오늘 오후 4시부터 6시까지 실내온도 평균 알려줘


 50%|█████     | 26/52 [02:51<03:38,  8.39s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 12.69s, Input: 지난주에 설정온도와 실내온도 차이가 가장 많이 났던 날은?


 52%|█████▏    | 27/52 [03:13<05:16, 12.65s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 22.57s, Input: 우리반과 옆반중 더 추운곳은 어디야?


 54%|█████▍    | 28/52 [03:22<04:32, 11.36s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 8.37s, Input: 이번주 우리반과 앞반의 평균 온도 알려줘


 56%|█████▌    | 29/52 [03:30<03:57, 10.31s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 7.84s, Input: 현재 설정온도랑 실내온도 차이 알려줘.


 58%|█████▊    | 30/52 [03:52<05:07, 14.00s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 22.60s, Input: 지난달에 설정온도와 실내온도 차이가 가장 많이 났던 날은?


 60%|█████▉    | 31/52 [04:03<04:31, 12.92s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 10.40s, Input: 이번주 우리반과 옆반의 평균 실내온도 차이 알려줘


 62%|██████▏   | 32/52 [04:10<03:42, 11.11s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 6.89s, Input: 2주전 가장 더웠던 날 알려줘


 63%|██████▎   | 33/52 [04:14<02:52,  9.08s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 4.35s, Input: 화성의 설정온도 확인해줘


 65%|██████▌   | 34/52 [04:17<02:08,  7.13s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.58s, Input: 옆반 습도 알려줘


 67%|██████▋   | 35/52 [04:22<01:53,  6.66s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.57s, Input: 지난 3일 동안 우리반 실내 온도 평균 값 알려줘.


 69%|██████▉   | 36/52 [04:28<01:41,  6.32s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.51s, Input: 오늘 오후 5시에 옆반의 설정온도는 어땠어?


 71%|███████   | 37/52 [04:35<01:40,  6.68s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 7.54s, Input: 올해 여름 우리반 실내온도 최대값과 최소값 알려줘


 73%|███████▎  | 38/52 [04:52<02:15,  9.67s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 16.63s, Input: 우리반과 앞반 중 가장 더운 방은?


 75%|███████▌  | 39/52 [04:55<01:39,  7.67s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 3.00s, Input: 지금 4층 평균 실내온도 알려줘


 77%|███████▋  | 40/52 [05:03<01:34,  7.89s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 8.42s, Input: 이번주 우리반과 앞반의 평균 온도 알려줘


 79%|███████▉  | 41/52 [05:11<01:26,  7.86s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 7.78s, Input: 현재 설정온도랑 실내온도 차이 알려줘.


 81%|████████  | 42/52 [05:23<01:30,  9.06s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 11.85s, Input: 지난달에 설정온도와 실내온도 차이가 가장 많이 났던 날은?
Error in eval: closing parenthesis '}' does not match opening parenthesis '[' (<string>, line 1)


 83%|████████▎ | 43/52 [05:33<01:25,  9.46s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 10.41s, Input: 이번주 우리반과 옆반의 평균 실내온도 차이 알려줘


 85%|████████▍ | 44/52 [05:45<01:20, 10.08s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 11.52s, Input: 2주전 가장 더웠던 날 알려줘


 87%|████████▋ | 45/52 [05:47<00:54,  7.82s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.55s, Input: 화성의 설정온도 확인해줘


 88%|████████▊ | 46/52 [05:50<00:37,  6.22s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 2.49s, Input: 옆반 습도 알려줘


 90%|█████████ | 47/52 [05:55<00:30,  6.04s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.61s, Input: 지난 3일 동안 우리반 실내 온도 평균 값 알려줘.


 92%|█████████▏| 48/52 [06:01<00:23,  5.86s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 5.45s, Input: 오늘 오후 5시에 옆반의 설정온도는 어땠어?


 94%|█████████▍| 49/52 [06:09<00:19,  6.43s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 7.75s, Input: 올해 여름 우리반 실내온도 최대값과 최소값 알려줘


 96%|█████████▌| 50/52 [06:21<00:16,  8.29s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 12.62s, Input: 우리반과 앞반 중 가장 더운 방은?


 98%|█████████▊| 51/52 [06:25<00:07,  7.05s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Time taken: 4.15s, Input: 현재 4층에서 온도 22℃ 이상인 방들 알려줘


100%|██████████| 52/52 [06:34<00:00,  7.59s/it]

Time taken: 8.75s, Input: 3주전 온도가 22℃ 이상이었던 날짜 알려줘
Saved to ../experiments/r-v7_r211_a422_ours_tr27_0629-checkpoint-25.json





# Eval query

In [31]:
import time
from typing import Any  # Any 타입 import 필요

class EM:
    """Column/key names of the per-example evaluation metrics.

    The values are used as dictionary keys of each evaluation report and
    therefore become the column names of the resulting DataFrame.
    """

    # Whether the candidate output has the required top-level structure.
    json_structure = "JsonStructureCorrectness"

    # Counts of (column, row index) combinations compared with ground truth.
    true_positive = "QueryTruePositive"
    false_positive = "QueryFalsePositive"
    false_negative = "QueryFalseNegative"
    
def _collect_query_indices(query_results):
    """Merge query results into a ``column -> list of row indices`` mapping."""
    merged = defaultdict(list)
    for query_result in query_results:
        for col in query_result["result_columns"]:
            merged[col].extend(query_result["result_indices"])
    return merged


def _mark_total_miss(evaluation_report, gt_total_combinations):
    """Score an example as a complete miss: nothing found, all GT cells missed."""
    evaluation_report[EM.true_positive] = 0
    evaluation_report[EM.false_positive] = 0
    evaluation_report[EM.false_negative] = gt_total_combinations


def eval_query(cand_response_filename, db_gt_filename="./gts.json"):
    """Compare the data each candidate response queries with the ground truth.

    For every candidate response the generated instructions are post-processed
    and executed, and the resulting (column, row index) combinations are
    compared with those stored in the ground-truth file. Per-example
    true/false positives and false negatives are collected, the executed
    responses are dumped next to the candidate file
    (``*_response.json``), and summary metrics are printed.

    Args:
        cand_response_filename: Path (``str``) of the JSON file with candidate
            responses. The experiment tag is taken from the 4th
            ``_``-separated token of the file name.
        db_gt_filename: Path of the ground-truth JSON file.

    Returns:
        ``pandas.DataFrame`` with one row per evaluated example. The TP/FP/FN
        columns are normalised by their per-example total. An empty frame is
        returned when no example could be evaluated.
    """
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)

    # Loop-invariant: the experiment tag is encoded in the file name.
    exp_tag = cand_response_filename.split("/")[-1].split("_")[3]

    evaluation_reports = []
    response_reports = []
    time_reports = []

    for cand_response in tqdm(cand_responses):
        cand_input = cand_response["Input"]
        scenario = cand_response["Scenario"]
        metadata = cand_response.get("Metadata")

        # Skip questions that have no ground truth.
        gt_matches = [d for d in db_gts if d["Input"] == cand_input and d["Scenario"] == scenario]
        assert len(gt_matches) <= 1
        if not gt_matches:
            print(f"No ground truth found for {cand_input}")
            continue

        gt_report = gt_matches[0]
        tags = gt_report["Tags"]

        gt_results = list(gt_report["QueryResults"])
        gt_query_results = _collect_query_indices(gt_results)
        gt_total_combinations = sum(len(v) for v in gt_query_results.values())

        user_input = gt_report["Input"]
        response_report = {
            "Input": user_input,
            "Metadata": metadata,
            "GT_Response": gt_report["Response"],
        }

        # Missing keys read as None so partially filled reports still tabulate.
        evaluation_report: dict[str, Any] = defaultdict(lambda: None)
        evaluation_report["Input"] = cand_input
        evaluation_report["Metadata"] = metadata
        evaluation_report["Tags"] = tags

        # --- structural check of the candidate output ------------------------
        candidate = cand_response["Candidate"]
        if isinstance(candidate, dict):
            requirements = ["Thinking", "Expectations", "Mapping"]
            if exp_tag in ["WoThinking", "WoMetadata+Thinking"]:
                requirements.remove("Thinking")
            elif exp_tag in ["woExp"]:
                requirements.remove("Expectations")
            elif exp_tag in ["woQM", "woQM+Script"]:
                requirements.remove("Mapping")
            evaluation_report[EM.json_structure] = all(
                requirement in candidate for requirement in requirements
            )
        else:
            evaluation_report[EM.json_structure] = False

        if not evaluation_report[EM.json_structure]:
            _mark_total_miss(evaluation_report, gt_total_combinations)
            print("Failed to parse input: ", cand_input, candidate)
            evaluation_reports.append(evaluation_report)
            response_reports.append(response_report)
            continue

        start_time = time.time()
        expertLLM_output_token_length = measure_token_count(candidate)
        print("Input,", cand_response["Input"], ",expertLLM_output_tlen,",  expertLLM_output_token_length)

        if exp_tag in ["woExp"]:
            candidate["Expectations"] = []

        try:
            mapping, expectations, required_variables, script = InputToInstruction.postprocess_v2(
                deepcopy(candidate),
                exp_tag=exp_tag
            )
        except Exception as e:
            # A candidate that cannot be post-processed counts as a complete
            # miss. `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"Postprocess failed for {cand_input}: {e}")
            _mark_total_miss(evaluation_report, gt_total_combinations)
            evaluation_reports.append(evaluation_report)
            response_reports.append(response_report)
            continue

        response, variables_to_report, required_variables, _cand_query_results = run_query_v2(user_input, metadata, mapping, expectations, required_variables, script, exp_tag=exp_tag)
        response_report["PD_Response"] = response
        time_reports.append(time.time() - start_time)
        response_reports.append(response_report)

        if len(_cand_query_results) == 0:
            _mark_total_miss(evaluation_report, gt_total_combinations)
            evaluation_reports.append(evaluation_report)
            continue

        cand_query_results = _collect_query_indices(_cand_query_results)
        # Count what the *candidate* queried (this previously summed the
        # ground-truth mapping by mistake).
        cand_total_combinations = sum(len(v) for v in cand_query_results.values())

        if len(gt_results) == 0:
            # Nothing should have been queried: everything returned is spurious.
            evaluation_report[EM.true_positive] = 0
            evaluation_report[EM.false_positive] = cand_total_combinations
            evaluation_report[EM.false_negative] = 0
            evaluation_reports.append(evaluation_report)
            continue

        # Compare over the union of columns. Restricting to the shared columns
        # ignored columns the candidate missed (false negatives) or added
        # (false positives), so a wrong-column answer scored as an exact match.
        true_positive = 0
        false_negative = 0
        false_positive = 0
        for col in set(gt_query_results) | set(cand_query_results):
            gt_rows = set(gt_query_results.get(col, ()))
            cand_rows = set(cand_query_results.get(col, ()))
            true_positive += len(gt_rows & cand_rows)
            false_negative += len(gt_rows - cand_rows)
            false_positive += len(cand_rows - gt_rows)

        evaluation_report[EM.true_positive] = true_positive
        evaluation_report[EM.false_positive] = false_positive
        evaluation_report[EM.false_negative] = false_negative
        evaluation_reports.append(evaluation_report)

    with open(cand_response_filename.replace('.json', '_response.json'), "w", encoding="utf-8") as f:
        json.dump(response_reports, f, ensure_ascii=False, indent=4)

    # Guard the average: no query may have been executed at all.
    mean_time = sum(time_reports) / len(time_reports) if time_reports else float("nan")
    print(f"Time: {time_reports}, {mean_time}")

    eval_df = pd.DataFrame(evaluation_reports)
    if not evaluation_reports:
        # Without rows the metric columns do not exist; nothing to summarise.
        print("No candidate responses could be evaluated.")
        return eval_df

    eval_df['ExactMatch'] = (
        (eval_df[EM.false_positive] == 0) & (eval_df[EM.false_negative] == 0)
    ).astype(int)

    final_result = {}
    for col in [EM.json_structure, "ExactMatch"]:
        final_result[col] = eval_df[col].mean()

    # Normalise per query so every example has the same weight.
    eval_df["Total"] = eval_df[EM.true_positive] + eval_df[EM.false_positive] + eval_df[EM.false_negative]
    eval_print = eval_df.drop(columns=["Metadata", "Tags"])
    print(eval_print)
    eval_df[EM.true_positive] = eval_df[EM.true_positive] / eval_df["Total"]
    eval_df[EM.false_positive] = eval_df[EM.false_positive] / eval_df["Total"]
    eval_df[EM.false_negative] = eval_df[EM.false_negative] / eval_df["Total"]

    # Rows whose total is 0 become NaN and are skipped by sum().
    truepos_sum = eval_df[EM.true_positive].sum()
    falsepos_sum = eval_df[EM.false_positive].sum()
    falseneg_sum = eval_df[EM.false_negative].sum()
    precision = truepos_sum / (truepos_sum + falsepos_sum)
    recall = truepos_sum / (truepos_sum + falseneg_sum)
    print(truepos_sum, falsepos_sum, falseneg_sum)
    print(precision, recall)
    f1 = 2 * (precision * recall) / (precision + recall)
    final_result["F1"] = f1
    final_result["Recall"] = recall

    for col in final_result:
        print(f"{col}: {final_result[col]:.2f}")

    return eval_df

# Eval tag

In [30]:
def eval_tag(cand_response_filename, db_gt_filename="./gts.json"):
    """Answer each candidate's question from its SQL result and save the answers.

    For every candidate record, the ground-truth record with the same
    ``Input`` and ``Scenario`` is looked up, the candidate SQL is executed via
    ``DBManager.execute_structured_query_string`` and the resulting rows are
    passed to ``ResponseGeneration.execute_raw`` together with the question and
    metadata. One report per matched candidate is written to
    ``cand_response_filename`` with ``.json`` replaced by ``_response.json``.

    This function only produces responses; it computes no TP/FP/FN metrics.

    Args:
        cand_response_filename: Path of a JSON list of candidate records. Each
            record is read for ``"Input"``, ``"Scenario"``, ``"Candidate"``
            (the SQL string) and, optionally, ``"Metadata"``.
        db_gt_filename: Path of a JSON list of ground-truth records, each read
            for ``"Input"``, ``"Scenario"`` and ``"Response"``.

    Returns:
        None. The reports are only written to disk.

    Raises:
        AssertionError: If more than one ground-truth record matches a
            candidate's ``Input``/``Scenario`` pair.
    """
    db_gts = read_json(db_gt_filename)
    cand_responses = read_json(cand_response_filename)
    response_reports = []

    # Iterating the bar itself advances it once per candidate, so the early
    # `continue` paths below cannot forget to update the progress display.
    with tqdm(cand_responses) as pbar:
        for cand_response in pbar:
            user_input = cand_response["Input"]
            scenario = cand_response["Scenario"]
            # Candidates without their own metadata are answered with None.
            metadata = cand_response.get("Metadata")

            # Skip questions that have no ground truth.
            matches = [d for d in db_gts if d["Input"] == user_input and d["Scenario"] == scenario]
            assert len(matches) <= 1
            if not matches:
                print(f"No ground truth found for {user_input}")
                continue
            gt_report = matches[0]

            # Registered right away; "PD_Response" is filled in below when available.
            response_report = {
                "Input": user_input,
                "Metadata": metadata,
                "GT_Response": gt_report["Response"],
            }
            response_reports.append(response_report)

            sql = cand_response["Candidate"]
            if sql == "":
                # Nothing to execute: the report is kept without "PD_Response".
                continue

            try:
                df = DBManager.execute_structured_query_string(sql)
            except Exception as e:
                # Best effort: a failing query is recorded and the run continues.
                print(f"Error: {e}")
                response_report["PD_Response"] = "쿼리중 에러가 발생했습니다"
                continue

            # NOTE(review): DataFrame.to_json escapes non-ASCII characters by
            # default (force_ascii=True), so Korean values reach the model as
            # \uXXXX sequences -- confirm this is intended.
            response = ResponseGeneration.execute_raw(
                f"""
                Input: {user_input}
                Metadata: {metadata}
                Data: {df.to_json(orient="records")}
                """,
                prompt = """
                질문을 주어진 데이터를 바탕으로 답변해줘.
                """
            )
            response_report["PD_Response"] = extract_content(response)

    with open(cand_response_filename.replace('.json', '_response.json'), "w", encoding="utf-8") as f:
        json.dump(response_reports, f, ensure_ascii=False, indent=4)

In [18]:
# eval_tag(
#     cand_response_filename="../experiments/r-v7_r211_a422_TAG_tr27_0629-step-0.json",
# )

# RUN eval

In [None]:
# name = "r-v7_r256_a512_ours_tr6_0503-checkpoint-63"
# name = "r-v7_r256_a512_ours_tr18_0503-checkpoint-52"
# name = "r-v7_r256_a512_ours_tr30_0503-checkpoint-54"
# name = "r-v7_r256_a512_ours_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_ours_tr60_0503-checkpoint-108"

# name = "r-v7_r256_a512_woall_tr6_0503-checkpoint-28"
# name = "r-v7_r256_a512_woall_tr18_0503-checkpoint-70"
# name = "r-v7_r256_a512_woall_tr30_0503-checkpoint-57"
# name = "r-v7_r256_a512_woall_tr45_0503-checkpoint-95"
# name = "r-v7_r256_a512_woall_tr60_0503-checkpoint-90"

# Experiment names to evaluate. Uncomment an entry to evaluate it as well.
names = [
    "r-v7_r211_a422_ours_tr27_0629-step-25",
    # "r-v7_r211_a422_woExp_tr27_0623-step-55",
    # "r-v7_r211_a422_WoMetadata+Thinking_tr27_0629-step-55",
    # "r-v7_r211_a422_WoThinking_tr27_0629-step-54",
    # "r-v7_r211_a422_woQM_tr27_0629-step-46",
    # "r-v7_r211_a422_woQM+Script_tr27_0629-step-55",
    # "r-v7_r211_a422_woScript_tr27_0629-step-55"
]

# Alternative list of experiment names:
# names = [
# # "../r-v7_r211_a422_TAG_tr27_0629-step-0.json"
# "r-v7_r211_a422_ours_tr27_0623-step-54",
# "r-v7_r211_a422_woExp_tr27_0623-step-55",
# "r-v7_r211_a422_WoMetadata_tr27_0623-step-60",
# "r-v7_r211_a422_WoMetadata+Thinking_tr27_0623-step-55",
# "r-v7_r211_a422_WoThinking_tr27_0623-step-51",
# "r-v7_r211_a422_woQM_tr27_0623-step-53",
# "r-v7_r211_a422_woQM+Script_tr27_0623-step-60",
# "r-v7_r211_a422_woScript_tr27_0623-step-55"
# ]

# Each name is evaluated from ../experiments/<name>.json.
for name in names:
    eval_query(f"../experiments/{name}.json")

  2%|▏         | 1/52 [00:01<01:24,  1.66s/it]

rg_last_input_token_length, 155 ,rg_last_output_token_length, 39


  4%|▍         | 2/52 [00:03<01:15,  1.50s/it]

rg_last_input_token_length, 193 ,rg_last_output_token_length, 34


  6%|▌         | 3/52 [00:07<02:26,  2.99s/it]

rg_last_input_token_length, 67 ,rg_last_output_token_length, 14


  8%|▊         | 4/52 [00:10<02:10,  2.72s/it]

rg_last_input_token_length, 90 ,rg_last_output_token_length, 25


 10%|▉         | 5/52 [00:14<02:37,  3.36s/it]

rg_last_input_token_length, 104 ,rg_last_output_token_length, 15


 12%|█▏        | 6/52 [00:16<02:11,  2.86s/it]

rg_last_input_token_length, 59 ,rg_last_output_token_length, 14


 13%|█▎        | 7/52 [00:18<01:53,  2.52s/it]

rg_last_input_token_length, 67 ,rg_last_output_token_length, 10


 15%|█▌        | 8/52 [00:21<01:56,  2.66s/it]

rg_last_input_token_length, 225 ,rg_last_output_token_length, 114


 19%|█▉        | 10/52 [00:21<01:03,  1.52s/it]

rg_last_input_token_length, 60 ,rg_last_output_token_length, 16


 21%|██        | 11/52 [00:22<00:57,  1.41s/it]

rg_last_input_token_length, 69 ,rg_last_output_token_length, 19


 23%|██▎       | 12/52 [00:23<00:47,  1.19s/it]

rg_last_input_token_length, 71 ,rg_last_output_token_length, 17


 27%|██▋       | 14/52 [00:24<00:30,  1.24it/s]

rg_last_input_token_length, 69 ,rg_last_output_token_length, 19


 29%|██▉       | 15/52 [00:25<00:31,  1.19it/s]

rg_last_input_token_length, 109 ,rg_last_output_token_length, 38


 33%|███▎      | 17/52 [00:25<00:22,  1.58it/s]

rg_last_input_token_length, 89 ,rg_last_output_token_length, 22


 35%|███▍      | 18/52 [00:26<00:22,  1.49it/s]

rg_last_input_token_length, 103 ,rg_last_output_token_length, 30


 37%|███▋      | 19/52 [00:28<00:30,  1.10it/s]

rg_last_input_token_length, 82 ,rg_last_output_token_length, 18


 38%|███▊      | 20/52 [00:30<00:39,  1.24s/it]

rg_last_input_token_length, 78 ,rg_last_output_token_length, 16


 42%|████▏     | 22/52 [00:32<00:38,  1.27s/it]

rg_last_input_token_length, 136 ,rg_last_output_token_length, 44


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f752622c430>>
Traceback (most recent call last):
  File "/venv/main/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [43]:
from collections import defaultdict

# Inputs removed from the comparison before it is saved.
# NOTE(review): the reason for excluding these two is not recorded here.
EXCLUDED_INPUTS = (
    "오늘 오후 5시에 옆반의 설정온도는 어땠어?",
    "지금 4층 평균 실내온도 알려줘",
)


def _merge_identical_answers(answers_by_tag):
    """Collapse experiment tags that produced exactly the same answer.

    Tags sharing an answer are combined under the key ``str(tuple(sorted(tags)))``;
    an answer given by a single tag keeps that tag as its key. ``"GT_Response"``
    is never merged and is placed last.

    Assumes every answer is hashable (e.g. a string) -- TODO confirm against
    the saved ``PD_Response`` values.
    """
    tags_by_answer = defaultdict(list)
    for tag, answer in answers_by_tag.items():
        if tag != "GT_Response":
            tags_by_answer[answer].append(tag)

    merged = {}
    for answer, tags in tags_by_answer.items():
        merged_key = str(tuple(sorted(tags))) if len(tags) > 1 else tags[0]
        merged[merged_key] = answer

    merged["GT_Response"] = answers_by_tag["GT_Response"]
    return merged


# Gather the saved answers of every experiment in `names`, keyed by user input.
responses = {}

for name in names:
    # The fourth "_"-separated field of the experiment name is used as its tag.
    exp_tag = name.split("_")[3]
    with open(f"../experiments/result_23/{name}_response.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        # Not named `input`: at cell level that would replace the builtin
        # for the rest of the kernel session.
        user_input = item["Input"]
        if user_input not in responses:
            responses[user_input] = {"GT_Response": item["GT_Response"]}
        # Reports without "PD_Response" are shown as an execution error.
        responses[user_input][exp_tag] = item.get("PD_Response", "실행중 에러 발생")

# Build a new dict instead of reassigning values while iterating over it.
responses = {
    user_input: _merge_identical_answers(answers_by_tag)
    for user_input, answers_by_tag in responses.items()
}

# pop() rather than del: the excluded inputs may be absent for some selections of `names`.
for excluded_input in EXCLUDED_INPUTS:
    responses.pop(excluded_input, None)

# save to json
with open("responses.json", "w", encoding="utf-8") as f:
    json.dump(responses, f, ensure_ascii=False, indent=4)
print(responses)

{'이번주 우리반과 앞반의 평균 온도 알려줘': {"('WoThinking', 'ours')": '이번주 우리반의 평균 실내온도는 25.98°C이고, 앞반의 평균 실내온도는 25.11°C입니다.', 'woExp': '이번주 우리반과 앞반의 평균 온도는 25.55°C입니다.', 'WoMetadata': '이번주 우리반과 앞반의 평균 실내온도는 25.67°C 입니다.', 'WoMetadata+Thinking': '이번주 우리반과 앞반의 평균 온도는 25.39°C 입니다.', 'woQM': '이번주 우리반과 앞반의 평균 실내온도는 25.55°C 입니다.', 'woQM+Script': '이번주 우리반의 평균 실내온도는 27.00°C이고, 앞반의 평균 실내온도는 26.50°C입니다. 이번주 우리반의 평균 실내온도는 앞반의 평균 실내온도보다 0.50°C 높습니다.', 'woScript': '이번주 우리반의 평균 실내온도(27.00°C)는 앞반의 평균 실내온도(26.50°C)보다 0.50°C 높습니다.', 'GT_Response': '이번주 우리반의 평균 실내온도는 25.98°C이고, 앞반의 평균 실내온도는 25.11°C입니다.'}, '현재 설정온도랑 실내온도 차이 알려줘.': {'ours': '현재 우리반 설정온도 데이터와 실내온도 데이터를 찾을 수 없습니다.', 'woExp': '현재 설정온도(23.00°C)와 실내온도(27.00°C)의 차이는 4.00°C입니다.', "('WoMetadata', 'WoMetadata+Thinking')": '실행중 에러가 발생했습니다.', 'WoThinking': '현재 설정온도(23.00°C)는 실내온도(27.00°C)보다 -4.00°C 낮습니다.', 'woQM': '현재 설정온도(23.00°C)와 실내온도(28.50°C)의 차이는 -5.50°C입니다.', 'woQM+Script': '현재 우리반의 설정온도(23.00°C)는 실내온도(27.00°C)보다 -4.00°C 낮습니다.', 'woScript': '현재 옆반의 설정온도(23.0