In [48]:
import json
import argparse

from utils import invoke, evaluate_time_predictions
from threadpool import ThreadPool
from tqdm import tqdm

prompt_file = "prompts/v4.txt"
bench_file = "../eval_ppl/dataset/multi_time_queries.json"

with open(prompt_file) as f:
    prompt = f.read()

with open(bench_file) as f:
    time_bench = json.load(f)

# Function to process a single example
def process_example(example, prompt):
    # params = {
    #     "question": example["question"],
    #     "current_time_iso": example["now"],
    #     "date_field": "timestamp",
    #     "other_date_fields": []
    # }
    params = {
        "question": example["question"],
        "current_time_iso": example["now"][11:-2],
        "date_field": example["date_field"],
        "other_date_fields": example["other_date_fields"]
    }
    
    return invoke(prompt, **params)


# Create a thread pool
thread_pool = ThreadPool(max_workers=2)

# Use the thread pool to process all examples in parallel
predicts = thread_pool.map_with_args(
    func=process_example,
    items=time_bench[:],
    fixed_args={"prompt": prompt},
    desc="Processing examples",
)

print(predicts)

Processing examples:   0%|          | 0/8 [00:00<?, ?it/s]

Processing examples: 100%|██████████| 8/8 [00:07<00:00,  1.11it/s]

[{'analysis': '1. Time expressions identified: ["from last week", "created after 2016-12-20"]\n2. After filtering aggregation references: ["from last week", "created after 2016-12-20"]\n3. Semantic context analysis:\n   - "from last week" modifies "orders" → maps to order_date\n   - "created after 2016-12-20" modifies "products" → maps to products.created_on\n4. Time expressions classification:\n   - ignored time expressions: ["created after 2016-12-20"]\n   - selected time expressions: ["from last week"]', 'start': '2025-05-20 03:53:30', 'end': '2025-05-27 03:53:30', 'query': 'Show me all orders with products created after 2016-12-20'}, {'analysis': '1. Time expressions identified: ["after 2016-12-20"]\n2. After filtering aggregation references: ["after 2016-12-20"]\n3. Semantic context analysis:\n   - "after 2016-12-20" modifies "products" → maps to products.created_on\n4. Time expressions classification:\n   - ignored time expressions: ["after 2016-12-20"]\n   - selected time expres




In [49]:
for i in range(6,8):
    for key in ["start", "end"]:
        if predicts[i][key]!=None:
            predicts[i][key] = predicts[i][key].replace("T", " ")
    
    print(time_bench[i]["now"], time_bench[i]["question"])
    print(predicts[i]["start"], predicts[i]["end"])
    print(predicts[i]["query"])
    print()
    print(predicts[i]["analysis"])
    print("*"*100)

TIMESTAMP('2025-05-27 03:53:30') List all unique product manufacturers for products created during 2016 12-10 and 12-15 that were sold in the last 30 days.
2025-04-27 03:53:30 2025-05-27 03:53:30
List all unique product manufacturers for products sold in the last 30 days.

1. Time expressions identified: ["2016 12-10 and 12-15", "the last 30 days"]
2. After filtering aggregation references: ["2016 12-10 and 12-15", "the last 30 days"]
3. Semantic context analysis:
   - "2016 12-10 and 12-15" modifies "products" → maps to products.created_on
   - "the last 30 days" modifies "products sold" → maps to order_date
4. Time expressions classification:
   - ignored time expressions: ["2016 12-10 and 12-15"]
   - selected time expressions: ["the last 30 days"]
****************************************************************************************************
TIMESTAMP('2025-05-27 03:53:30') List all unique product manufacturers for products created during 2016 12-10 and 12-15
None None
List al

In [1]:
sample = {
        "analysis": "1. Time expressions identified: [\"on May 15\"]\n2. After filtering aggregation references: [\"on May 15\"]\n3. Semantic context analysis:\n   - \"on May 15\" modifies \"orders\" \u2192 maps to order_date\n4. Time expressions classification:\n   - ignored time expressions: []\n   - selected time expressions: [\"on May 15\"]",
        "start": "2025-05-15 00:00:00",
        "end": "2025-05-15 23:59:59",
        "question": "Show me all orders",
        "date_field": "order_date",
        "now": "TIMESTAMP('2025-05-27 03:53:30')",
        "target_index": "ecommerce",
        "origin_query": "source=ecommerce \n| where DATE(order_date) = '2025-05-15'\n| fields customer_full_name, order_date, total_quantity, taxless_total_price",
        "query": "source=ecommerce| fields `order_id`, `customer_full_name`, `customer_last_name`, `taxful_total_price`, `products.product_name`"
    }

In [4]:
def insert_after_first_pipe(query, insert_text):
    parts = query.split("|", 1)
    if len(parts) < 2:
        return query

    new_query = parts[0] + "|" + insert_text + "|" + parts[1]
    return new_query

query = sample["query"]
if "start" in sample and sample["start"] is not None:
    query = insert_after_first_pipe(
        sample["query"], f"where {sample['date_field']} > TIMESTAMP({sample['start']}) "
    )
    print("here")
    print(query)
if "end" in sample and sample["end"] is not None:
    query = insert_after_first_pipe(
        sample["query"], f"where {sample['date_field']} < TIMESTAMP({sample['end']}) "
    )
if "date_field" in sample:
    query = insert_after_first_pipe(
        sample["query"], f"where {sample['date_field']} < NOW() "
    )
if "now" in sample:
    query = query.replace("NOW()", sample["now"])
sample["query"] = query

here
source=ecommerce|where order_date > TIMESTAMP(2025-05-15 00:00:00) | fields `order_id`, `customer_full_name`, `customer_last_name`, `taxful_total_price`, `products.product_name`


In [5]:
"start" in sample and sample["start"] is not None

True

In [1]:
import json

with open("../eval_ppl/dataset/time_related_queries.json") as f:
    samples = json.load(f)
    
with open("../eval_ppl/dataset_parsed/time_related_queries.json") as f:
    parsed = json.load(f)
    
with open("../eval_ppl/dataset_parsed/time_related_queries.json") as f:
    parsed = json.load(f)
    
for i in range(len(samples)):
    if samples[i]["time_range"][0]!= parsed[i]["start"] or samples[i]["time_range"][1]!= parsed[i]["end"]:
        print(i)

9
81


In [None]:
with open("../eval_ppl/results/gold/time_related_queries.json") as f:
    gold = json.load(f)
    
with open("../eval_ppl/results/baseline/time_related_queries.json") as f:
    baseline = json.load(f)
    
with open("../eval_ppl/results/baseline-parsed/time_related_queries.json") as f:
    parsed = json.load(f)
    
for i in range(len(samples)):
    if samples[i]["time_range"][0]!= parsed[i]["start"] or samples[i]["time_range"][1]!= parsed[i]["end"]:
        print(i) 