In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Put the parent of the current working directory on sys.path so the `src.*`
# imports below can be resolved. The membership check keeps repeated runs of
# this cell from appending the same entry more than once.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.db.manager import DBManager
from src.input_to_instructions.load_and_execute import *
from src.input_to_instructions.types import *
from src.plot_graph.execute import *
from src.operation.execute import *

In [3]:
from collections import defaultdict, Counter
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import json
import itertools

from db.manager import DBManager
from operation.execute import OperationExecutor
from pathlib import Path
import warnings
import datetime


warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

INFO:db.instance:Connected to the database PerSite_DB


In [4]:
# Base directory (relative to the working directory) from which the dataset
# path `{BASE_DIR}/finetuning/dataset/<name>` is built further down.
BASE_DIR = "../"
def read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

    Args:
        path: Location of the JSON file; anything ``open`` accepts.

    Returns:
        The decoded JSON document (e.g. a list or dict).
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)

In [5]:
import matplotlib.pyplot as plt
import matplotlib

from src.plot_graph.execute import plot_graph_plotly
# Render matplotlib text with the NanumGothicCoding font family.
# NOTE(review): presumably chosen so Korean labels display correctly; the font
# has to be installed on the machine running the notebook - confirm.
matplotlib.rcParams['font.family'] = 'NanumGothicCoding'
def run_query(user_input, metadata, instructions):
    """Execute a sequence of instructions, sharing results between steps.

    A working dictionary (seeded with ``"Metadata"``) carries every
    intermediate result. Each instruction is dispatched on its exact type:

    * ``InstructionQ`` - runs ``DBManager.structured_query_data_t`` and stores
      the resulting frame under ``instruction.result_name``.
    * ``InstructionO`` - runs ``OperationExecutor.execute`` and merges the
      returned mapping into the working dictionary.
    * ``InstructionG`` - builds a figure with ``plot_graph_plotly`` and shows it.
    * ``InstructionR`` - deliberately a no-op (response generation is disabled
      in this notebook).

    Args:
        user_input: The original user request. Currently unused here.
        metadata: Metadata passed to the DB query and exposed to later steps.
        instructions: Iterable of instruction objects to run in order.

    Returns:
        None. Execution stops early (after printing an apology message) as
        soon as a query yields ``None``.
    """
    context = {"Metadata": metadata}

    for step in instructions:
        step_type = type(step)

        if step_type == InstructionQ:
            frame = DBManager.structured_query_data_t(metadata, step.args)
            if frame is None:
                # Message reads: "Sorry, no related data could be found."
                print("죄송합니다, 관련 데이터를 찾을 수 없습니다.", "response")
                return

            # Demo-only clean-up: keep just the rows that contain no -1 value.
            rows_without_sentinel = (frame != -1).all(axis=1)
            context[step.result_name] = frame.loc[rows_without_sentinel]

        elif step_type == InstructionO:
            context.update(
                OperationExecutor.execute(context, step.scripts, step.returns)
            )

        elif step_type == InstructionG:
            figure = plot_graph_plotly(step, context)
            figure.show()

        elif step_type == InstructionR:
            # Response generation is intentionally skipped.
            continue

In [7]:
def build_query_groundtruth(dateset_name):
    """Replay the scenario-1 samples of a fine-tuning dataset through ``run_query``.

    Sub-directories of ``{BASE_DIR}/finetuning/dataset/{dateset_name}`` whose
    name contains ``"scenario1"`` are visited in sorted order so that repeated
    runs process samples in the same sequence. For each one, the records of
    ``graph_temp.json`` are loaded, tagged with the directory name under
    ``"Scenario"`` and - for datasets whose name contains ``"v7"`` - given the
    content of that directory's ``metadata.json`` under ``"Metadata"``.
    Every record's ``"Response"`` is then converted to instructions with
    ``InputToInstruction.postprocess`` and executed with ``run_query``.

    Args:
        dateset_name: Name of the dataset directory (spelling kept so existing
            keyword callers continue to work).

    Returns:
        None. The function is run for its side effects (whatever
        ``run_query`` displays).

    Raises:
        KeyError: If a record lacks ``"Response"``, ``"Input"`` or
            ``"Metadata"`` (the latter only matters for non-v7 datasets, where
            it is not attached here).
        OSError: If the dataset directory or an expected JSON file is missing.
    """
    attach_metadata = "v7" in dateset_name
    dataset_dir = Path(f"{BASE_DIR}/finetuning/dataset/{dateset_name}")

    records = []
    # iterdir() yields entries in arbitrary order; sort for reproducible runs.
    for scenario_dir in sorted(dataset_dir.iterdir()):
        if not scenario_dir.is_dir() or "scenario1" not in scenario_dir.name:
            continue

        # Metadata is loaded once per scenario and passed along explicitly
        # rather than being picked up through a closure over loop variables.
        scenario_metadata = None
        if attach_metadata:
            scenario_metadata = read_json(f"{scenario_dir}/metadata.json")

        # Only the graph samples are replayed at the moment.
        for record in read_json(f"{scenario_dir}/graph_temp.json"):
            record["Scenario"] = scenario_dir.name
            if attach_metadata:
                record["Metadata"] = scenario_metadata
            records.append(record)

    for record in records:
        instructions = InputToInstruction.postprocess(record["Response"])
        # Only the fields that are actually consumed are read, so records
        # without optional keys such as "Tags" no longer abort the replay.
        run_query(record["Input"], record["Metadata"], instructions)
        
  

In [None]:

# Replay the scenario-1 samples of this v7 dataset; any graphs produced while
# executing the instructions are displayed as a side effect.
build_query_groundtruth("v7-250309-reduceinputanddatefunctioncall")