In [1]:
import sqlite3
import pandas as pd
import dspy
import dotenv, os
from pydantic import BaseModel, Field
import openai
import pm4py
import ast
import networkx as nx
from numpy import random
from dspy.evaluate import Evaluate
from collections import defaultdict
import tqdm as notebook_tqdm
import copy
import re
import traceback
from queue import Queue
import threading
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import phoenix as px
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from chroma_retriever import Chroma
from chromadb.utils import embedding_functions
from contextlib import contextmanager
from dspy.primitives.assertions import assert_transform_module, backtrack_handler
import functools
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [2]:
from SQL_programs.sql_reasoning import PM_SQL_multi_sp

In [3]:
# importing the programs & utils
from Combined_programs.combined import PM_combined
from Combined_programs.combined_perfect_decision import PM_combined_perfect_d
from Combined_programs.pm_isolated import PM_isolated

# utils
from Utils.column_dependency import DependencyGraph
from Utils.saving_functions import save_report_v2, save_report_isolated
from SQL_programs.sql_llm_judge import LM_EVAL

In [4]:
INPUT_FILE_NAME = "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dataset/Road_Traffic_Fine_Management_Process.xes" #replce with your file path
COLUMN_INSTRUCTIONS = "/Users/sulzair/Documents/Bachelor Thesis/Proof-of-Concept/Experiments/PM_EVALQUESTIONS_final.csv"
SQL_QUESTIONS = '/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/sql_questions_to_splitt.csv'
SQLITE_DB_NAME = "combined.db" #"my_database.db" #leve as is
SQLITE_DB_ISOLATED = "isolated.db"
LLM_MODEL_TYPE = "gpt-4o" #gpt-4-turbo" #"gpt-3.5-turbo-0125" #leave as is for gpt 3.5 or change to "gpt-4-1106-preview" for gpt 4
PM_PY_PATH = "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_add_fewshot_12.json"
PM_SQL_PATH = "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/sql/sql_bootstrap_bootstrap_fewshot_1.json" # potentially we can go higher
JUDGE_PATH = "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/judge/judge_optimized_final.json"

In [5]:
phoenix_session = px.launch_app()
endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [None]:
# Get all evaluations for a project
evals = px.Client().get_evaluations(project_name="your_project")

# Get spans with evaluations as dataframe
spans_df = px.Client().get_spans_dataframe()

In [110]:
spans_df = px.Client().get_spans_dataframe()

  df_attributes = pd.DataFrame.from_records(


In [112]:
len(spans_df)

319

In [None]:
spans_df = pd.read_csv("my_spans.csv")

# Launch Phoenix with loaded spans
import phoenix as px
px.launch_app(trace=spans_df)

In [113]:
spans_df.to_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Saved Spans/my_spans.csv", index=False)

In [108]:
evals = px.Client().get_evaluations(project_name="default")

In [109]:
evals

[]

In [107]:

my_traces = px.TraceDataset.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Saved Spans/")



AttributeError: 'str' object has no attribute 'evaluations'

In [None]:
px.launch_app(trace=px.TraceDataset.load(my_traces))

In [6]:
dotenv.load_dotenv()
lm = dspy.LM('openai/gpt-4o', temperature=0.3, max_tokens=4096, stop=None, cache=False)
dspy.settings.configure(lm = lm)

In [7]:
df = pm4py.read_xes(INPUT_FILE_NAME)
df.columns = df.columns.str.replace(':', '_', regex=False)
# replace nan values with 0 in columns "amount", "expense", "paymentAmount", "totalPaymentAmount"
df['amount'] = df['amount'].fillna(0)
df['expense'] = df['expense'].fillna(0)
df['paymentAmount'] = df['paymentAmount'].fillna(0)
df['totalPaymentAmount'] = df['totalPaymentAmount'].fillna(0)
#fturn euros into cents
df['amount'] = df['amount'] * 100
df['amount'] = df['amount'].astype(int)
df['expense'] = df['expense'] * 100
df['expense'] = df['expense'].astype(int)
df['paymentAmount'] = df['paymentAmount'] * 100
df['paymentAmount'] = df['paymentAmount'].astype(int)
df['totalPaymentAmount'] = df['totalPaymentAmount'] * 100
df['totalPaymentAmount'] = df['totalPaymentAmount'].astype(int)

conn = sqlite3.connect(SQLITE_DB_NAME)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS temp_table;")

print(cur.fetchall())
conn.commit()

df.to_sql("event_log", conn, if_exists="replace", index=False)
cur = conn.cursor()
indexes = [
    "CREATE INDEX IF NOT EXISTS idx_case_concept_name ON event_log(case_concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_concept_name ON event_log(concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_timestamp ON event_log(time_timestamp);"
]
for index_query in indexes:
    cur.execute(index_query)
conn.commit()

cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
conn.commit()
cur.execute("""UPDATE event_log SET idx = (
    SELECT rowid FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
        FROM event_log
    ) temp WHERE temp.rowid = event_log.rowid
);""")
conn.commit()
cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
conn.commit()
cur.execute("VACUUM;")
cur.execute("ANALYZE;")
conn.commit()
cur.close()

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

[]


# Initialize Chroma Retriever

In [92]:
# Initialize the Chroma retriever

col_desc = """- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.
- 'org_resource' (int): A numeric code indicating the employee who handled the case.
- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  
        'G': dismissed by the judge
        '#': dismissed by the prefecture
        NIL: not dismissed, i.e., to be paid.
        There are several other values used for which we do not know the semantics.
- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.
- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.
- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)
- 'article' (string): The number of the article of the Italian roadtraffic law that is violated by the offender (e.g., article 157 refers to stopping and parking vehicles).
- 'points' (float): Penalty points deducted from the driving license. In Italy, each driver starts with 20 points on their license and may loose points for each offence, based on the gravity.
- 'expense' (int): The additional amount due to be paid for postal expenses. There are no nan values in this column.
- 'notificationType' (string): A flag indicating to whom the fine refers. 'P': car owner, 'C': car driver.
- 'lastSent' (datetime): N/A
- 'paymentAmount' (int): The amount paid by the offender in one transaction. There are no nan values in this column.
- 'matricola' (string): N/A (Probably refers to the matriculation number of the car.)
- 'concept_name' (string): the activity/ event type name
        Activity Description, column: 'concept:name':
            > 'Create Fine': The initial creation of the fine in the information system. It initializes event log attributes amount, dismissal, points and totalPaymentAmount.
            > 'Send Fine': A notification about the fine is sent by post to the offender.
            > 'Insert Fine Notification': The notification is received by the offender.
            > 'Add penalty': An additional penalty is applied.
            > 'Payment': A payment made by the offender is registered.
            > 'Send for Credit Collection': Unpaid fines are sent for credit collection. A separate process is started by a collection agency to collect the money of the unpaid fines.
            > 'Insert Date Appeal to Prefecture': The offender appeals against the fine to the prefecture. A prefecture in Italy is an administrative body representing the national government in each province.
            > 'Send Appeal to Prefecture': The appeal is sent to the prefecture by the local police.
            > 'Receive Result Appeal from Prefecture': The local police receives the result of the appeal. If the prefecture dismisses the fine, the appeal is deemed accepted, and the obligation to pay the fine is cancelled. In this case, there is no need for the police to receive the result from the prefecture (Receive Result Appeal from Prefecture) and notify the offender (Notify Result Appeal to Offender).
            > 'Notify Result Appeal to Offender': The local police informs the offender of the appeal result. 
            > 'Appeal to Judge': The offender appeals against the fine to a judge.
        IMPORTANT: The last event in a case can be arbitrary. There is no guarantee that the last event is 'Send Fine' or 'Payment'. The last event can be any event in the log."""

rm = Chroma(sentence_transformer_ef = sentence_transformer_ef , documentation = col_desc) # for python, use standard cols

["- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.", "- 'org_resource' (int): A numeric code indicating the employee who handled the case.", "- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  \n        'G': dismissed by the judge\n        '#': dismissed by the prefecture\n        NIL: not dismissed, i.e., to be paid.\n        There are several other values used for which we do not know the semantics.", "- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.", "- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.", "- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)", "- 'article' (string): The

In [79]:
rm.cols # is the case:concept:name and time:timestamp in there?

['amount',
 'org_resource',
 'dismissal',
 'vehicleClass',
 'totalPaymentAmount',
 'lifecycle_transition',
 'article',
 'points',
 'expense',
 'notificationType',
 'lastSent',
 'paymentAmount',
 'matricola',
 'concept_name']

# Initialize SQL, PY and combined Classes

In [93]:
class SQLiteConnectionPool:
    def __init__(self, database, max_size=10):
        self.database = database
        self.pool = Queue(maxsize=max_size)
        for _ in range(max_size):
            self.pool.put(self.create_new_connection())

    def create_new_connection(self):
        return sqlite3.connect(self.database, check_same_thread=False)

    def get_connection(self):
        return self.pool.get()

    def release_connection(self, conn):
        self.pool.put(conn)

pool = SQLiteConnectionPool(SQLITE_DB_NAME, max_size=10)

In [94]:

col_instructions = pd.read_csv(COLUMN_INSTRUCTIONS)

dp_graph = DependencyGraph(rm.cols, col_instructions) # check what is in rm.cols (it appends the timestamp and case concept name columns)

In [95]:
required_cols_test = ['appeal_judge_cancelled']

In [96]:
rm.cols

['amount',
 'org_resource',
 'dismissal',
 'vehicleClass',
 'totalPaymentAmount',
 'lifecycle_transition',
 'article',
 'points',
 'expense',
 'notificationType',
 'lastSent',
 'paymentAmount',
 'matricola',
 'concept_name',
 'case_concept_name',
 'time_timestamp']

In [97]:
testing_cols_generate = dp_graph.cols(required_cols_test, rm.cols)
print(testing_cols_generate)

['appealed_to_judge', 'dismissed_by_judge', 'appeal_judge_cancelled']


load the combined.py program

In [17]:
py_combined = assert_transform_module(
    PM_combined(
        dp_graph = dp_graph, 
        rm=rm, 
        pool=pool,
        conn_path= SQLITE_DB_NAME,
        pm_py_path= PM_PY_PATH,
        pm_sql_path=PM_SQL_PATH ), 
        functools.partial(backtrack_handler, max_backtracks=4))

py_combined.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures.json") # save the signature
py_combined.pm_py.load(PM_PY_PATH) # load the demonstrations
py_combined.pm_sql.load(PM_SQL_PATH) # load the demonstrations
py_combined.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures.json") # save including the demonstrations


load the combined_perfect decisions.py program

In [12]:
py_combined_perfect = assert_transform_module(
    PM_combined_perfect_d(
        dp_graph = dp_graph, 
        rm=rm, 
        pool=pool,
        conn_path= SQLITE_DB_NAME,
        pm_py_path= PM_PY_PATH,
        pm_sql_path=PM_SQL_PATH ), 
        functools.partial(backtrack_handler, max_backtracks=4))


In [13]:
py_combined_perfect.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures_perfect.json") # save the signature
py_combined_perfect.pm_py.load(PM_PY_PATH) # load the demonstrations
py_combined_perfect.pm_sql.load(PM_SQL_PATH) # load the demonstrations
py_combined_perfect.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures_perfect.json") # save including the demonstrations

# Load the Train and Test data

In [18]:
qa = pd.read_csv(SQL_QUESTIONS)

In [41]:
qa.head(100)

Unnamed: 0,Category,Question,Answer,Direct col to use,Split
0,Generic Questions,How many events are in the log?,561470,,test
1,,How many cases are in the log?,150370,,train
2,,When is the start of the event log?,1/1/2000 / 2000-01-01,,test
3,,When is the end of the event log?,6/18/2013/ 2013-06-18,,test
4,Activity Count (log),How many Create Fine events occur?,150370,,train
...,...,...,...,...,...
95,,What is the lowest amount last found in the event log?,0,['amount_last'],test
96,,How many cases have the lowest amount last found in the event log?,36,['amount_last'],test
97,,Which case has the highest outstanding balance?,C18395:\t801985,['outstanding_balance'],test
98,,What is the average outstanding balance per case?,5571.34701,['outstanding_balance'],test


In [20]:
trainset = []
testset = []
for category, question, answer, cols, split in qa.values:
    print(type(cols) == float)
    py = dspy.Example(question = question, example = answer, req_cols = cols).with_inputs("question", "req_cols")
    if split == "train":
        trainset.append(py)
    else:
        testset.append(py)

True
True
True
True
True
True
True
True
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False

In [21]:
print(len(trainset), len(testset))


78 103


# Eval Class

import from sql_llm_judge.py the LM_EVAL program

In [98]:
judge_lm = dspy.LM(model='openai/gpt-4o', temperature=1, max_tokens=3000, stop=None, cache=True, seed= 99)

judge_adjusted = LM_EVAL(judge_lm)
judge_adjusted.load(JUDGE_PATH)

# Eval of combined

**'openai/gpt-4o-mini', temperature=0.3, max_tokens=4096,**

- pm_combined (check + reasoning uncompiled) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score: 168.0 time: 32m 16
Name: testset_pm_combined_decision_uc

(maybe we run this again (not urgent))


- pm_combined_perfect (check + reasoning uncompiled) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score:164.1 time: 31m 50s Name: testset_pm_combined_perfect_decision_uc

**'openai/gpt-4o', temperature=0.3, max_tokens=4096,**

- pm_combined (check + reasoning uncompiled) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score: 154.4 time: 28m 53
Name: testset_pm_combined_decision_uc_4o

- pm_combined_perfect (check + reasoning uncompiled) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score:116.5 time: 31m 05s Name: testset_pm_combined_perfect_decision_uc_4o

second run (after some weird results)

**-> potentially something wrong (restarted notebook)**

- pm_combined_perfect (check + reasoning uncompiled) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score:182.5% time: 29m 30s Name: testset_pm_combined_perfect_decision_uc_4o_run2

Notes: All columns generated successfully & correctly. very good. We have a slight mismatch between precision (0.7) and recall (1), whereby the model will more often than not, tend to lean towards using the sql query instead of python code to help out. Its a little too self-sufficient


- pm_combined (check + reasoning compiled (combined_optimized.json)) (using py_add_fewshot_12.json & sql_bootstrap_bootstrap_fewshot_1.json) (assertions activated): score: 172.8% time: 29m 12
Name: testset_pm_combined_decision_compiled_4o



**SQL Isolated**
gpt 4o, temp .3
- PM_SQL_multi_sp compiled(sql_bootstrap_bootstrap_fewshot_1.json) (assertions active) 106.8%, time: 10:48, Name: testset_pm_isolated_c_4o

- PM_isolated uncompiled (assertions active) Name: testset_pm_isolated_definitions_uc_4o Score:158.3 Time: 14m 33s 

gpt4-mini temp .3

- PM_SQL_multi_sp compiled(sql_bootstrap_bootstrap_fewshot_1.json) (assertions active) Score: 99.0 , time: , Name: testset_pm_isolated_c_mini

- PM_isolated uncompiled (assertions active) Name: testset_pm_isolated_definitions_uc_mini Score:161.2 Time: 14m 33s 

 normal stuff, reset db, EVAluate, + getting the FP,TP etc scores

In [27]:
len(trainset)

78

In [34]:
testset[63:64]

[Example({'question': 'How many cases are unresolved?', 'example': '28984', 'req_cols': "['unresolved']"}) (input_keys={'question', 'req_cols'})]

In [101]:
testset[18:19]

[Example({'question': 'How many cases have had their appeal rejected by the judge?', 'example': '462', 'req_cols': "['appeal_judge_cancelled']"}) (input_keys={'question', 'req_cols'})]

In [102]:
evaluate = Evaluate(devset=testset[18:19], metric=judge_adjusted, num_threads=1, display_progress=True, display_table=len(testset[18:19]), return_outputs=True, max_errors=10)

In [103]:
conn = sqlite3.connect(SQLITE_DB_NAME)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS temp_table;")

print(cur.fetchall())
conn.commit()

df.to_sql("event_log", conn, if_exists="replace", index=False)
cur = conn.cursor()
indexes = [
    "CREATE INDEX IF NOT EXISTS idx_case_concept_name ON event_log(case_concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_concept_name ON event_log(concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_timestamp ON event_log(time_timestamp);"
]
for index_query in indexes:
    cur.execute(index_query)
conn.commit()

cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
conn.commit()
cur.execute("""UPDATE event_log SET idx = (
    SELECT rowid FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
        FROM event_log
    ) temp WHERE temp.rowid = event_log.rowid
);""")
conn.commit()
cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
conn.commit()
cur.execute("VACUUM;")
cur.execute("ANALYZE;")
conn.commit()
cur.close()

# dont forget to reset the rm class



[]


seed 1
seed 2

4o seed 3, seed 4, seed 5, seed 6, seed 7, seed 8, seed9

In [104]:
lm = dspy.LM('openai/gpt-4o', temperature=0.3, max_tokens=4096, stop=None, cache=True, seed=399995)
dspy.settings.configure(lm = lm)

In [69]:
py_combined = assert_transform_module(
    PM_combined(
        dp_graph = dp_graph, 
        rm=rm, 
        pool=pool,
        conn_path= SQLITE_DB_NAME,
        pm_py_path= PM_PY_PATH,
        pm_sql_path=PM_SQL_PATH ), 
        functools.partial(backtrack_handler, max_backtracks=4))


py_combined.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_optimized.json") # save including the demonstrations


In [105]:
py_combined_perfect = assert_transform_module(
    PM_combined_perfect_d(
        dp_graph = dp_graph, 
        rm=rm, 
        pool=pool,
        conn_path= SQLITE_DB_NAME,
        pm_py_path= PM_PY_PATH,
        pm_sql_path=PM_SQL_PATH ), 
        functools.partial(backtrack_handler, max_backtracks=4))

py_combined_perfect.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_optimized.json")

In [30]:
py_combined_perfect.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/sometest_delete.json") # save the signature

In [22]:
py_combined_perfect = assert_transform_module(
    PM_combined_perfect_d(
        dp_graph = dp_graph, 
        rm=rm, 
        pool=pool,
        conn_path= SQLITE_DB_NAME,
        pm_py_path= PM_PY_PATH,
        pm_sql_path=PM_SQL_PATH ), 
        functools.partial(backtrack_handler, max_backtracks=4))
py_combined_perfect.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures_perfect.json") # save the signature
py_combined_perfect.pm_py.load(PM_PY_PATH) # load the demonstrations
py_combined_perfect.pm_sql.load(PM_SQL_PATH) # load the demonstrations
py_combined_perfect.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/combined/combined_signatures_perfect.json") # save including the demonstrations

In [106]:
score, outputs, scores = evaluate(program = py_combined_perfect, return_all_scores= True)

  0%|          | 0/1 [00:00<?, ?it/s]check Decision.NO
calling python module cols to generate:  ['appealed_to_judge', 'dismissed_by_judge', 'appeal_judge_cancelled']
length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True
calling python module cols to generate:  ['appealed_to_judge', 'dismissed_by_judge', 'appeal_judge_cancelled']
length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True
calling python module cols to generate:  ['appealed_to_judge', 'dismissed_by_judge', 'appeal_judge_cancelled']
length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True
finished calling python, now calling sql module
Average Metric: 2.00 / 1 (200.0%): 100%|██████████| 1/1 [01:24<00:00, 84.87s/it]

2025/01/10 13:41:39 INFO dspy.evaluate.evaluate: Average Metric: 2 / 1 (200.0%)





Unnamed: 0,question,example,req_cols,answer,LM_EVAL
0,How many cases have had their appeal rejected by the judge?,462,['appeal_judge_cancelled'],462,✔️ [2]


In [40]:
py_combined.pm_py.errors

defaultdict(list, {})

In [23]:
df_merged = save_report_v2(outputs,scores, "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Results_Combined/testset_pm_combined_decision_compiled_4o", py_combined, judge_adjusted)

check how many columns are incorrect

In [43]:
import numpy as np

In [44]:
# get the gold columns
gold = pd.read_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/PM_EVALQUESTIONS_DF4.csv")
gold['time_timestamp'] = pd.to_datetime(gold['time_timestamp'])
gold['time_timestamp_beginn'] = pd.to_datetime(gold['time_timestamp_beginn'])
gold['time_timestamp_end'] = pd.to_datetime(gold['time_timestamp_end'])
gold['duration'] = pd.to_timedelta(gold['duration'], unit='s')
gold_sorted = gold.sort_values(by=['case_concept_name', 'time_timestamp'])
gold_sorted = gold_sorted.reset_index(drop=True)

# get the combined.db and convert to df
query = 'SELECT * FROM event_log'
conn = sqlite3.connect(SQLITE_DB_NAME)
tp = pd.read_sql_query(query, conn, parse_dates = ['time_timestamp', 'time_timestamp_beginn', 'time_timestamp_end'])
if "duration" in tp.columns:
    tp['duration'] = pd.to_timedelta(tp['duration'], unit = "s")
for cols in tp.columns:
    if tp[cols].isin([0,1]).all() and not cols.endswith("_count"):
        tp[cols] = tp[cols].astype(bool)
tp_sorted = tp.sort_values(by=['case_concept_name', 'time_timestamp'])

# convert NoneType to nan (float)
tp_sorted = tp_sorted.replace({None: np.nan})

In [45]:
column_name = []
length_equal = []
columns_equal = []
type_a = []
type_b = []
rows_different = []
check_df = pd.DataFrame(columns = ['column_name', 'length_equal', 'columns_equal', 'type_a', 'type_b','rows_different'])
for col in tp_sorted.columns:
    if col in gold_sorted.columns:
        # create a small df with the cols: column name, length of columns are equal, columns are equal, types of the columns (two separate colums),number of rows different
        column_name.append(col)
        length_equal.append(len(tp_sorted[col]) == len(gold_sorted[col]))
        columns_equal.append(tp_sorted[col].equals(gold_sorted[col]))
        type_a.append(tp_sorted[col].dtype)
        type_b.append(gold_sorted[col].dtype)
        non_equal_elements = tp_sorted[col] != gold_sorted[col]
        rows_different.append(non_equal_elements.sum())
        

check_df['column_name'] = column_name
check_df['length_equal'] = length_equal
check_df['columns_equal'] = columns_equal
check_df['type_a'] = type_a
check_df['type_b'] = type_b
check_df['rows_different'] = rows_different

check_df.head(50)

Unnamed: 0,column_name,length_equal,columns_equal,type_a,type_b,rows_different
0,amount,True,True,int64,int64,0
1,org_resource,True,False,object,float64,561470
2,dismissal,True,True,object,object,406404
3,concept_name,True,True,object,object,0
4,vehicleClass,True,True,object,object,411100
5,totalPaymentAmount,True,True,int64,int64,0
6,lifecycle_transition,True,True,object,object,0
7,time_timestamp,True,True,"datetime64[ns, UTC]","datetime64[ns, UTC]",0
8,article,True,True,float64,float64,411100
9,points,True,True,float64,float64,411100


# Isolated Program

Running the sql program on the vanilla event log

In [29]:
SQLITE_DB_ISOLATED = "isolated.db"

In [61]:
# reset rm and the database

conn = sqlite3.connect(SQLITE_DB_ISOLATED)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS temp_table;")

print(cur.fetchall())
conn.commit()

df.to_sql("event_log", conn, if_exists="replace", index=False)
cur = conn.cursor()
indexes = [
    "CREATE INDEX IF NOT EXISTS idx_case_concept_name ON event_log(case_concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_concept_name ON event_log(concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_timestamp ON event_log(time_timestamp);",
    "CREATE INDEX IF NOT EXISTS idx_totalPaymentAmount ON event_log(totalPaymentAmount);",
    "CREATE INDEX IF NOT EXISTS idx_expense ON event_log(expense);",
    "CREATE INDEX IF NOT EXISTS idx_amount ON event_log(amount);",
    "CREATE INDEX IF NOT EXISTS idx_case_totalPaymentAmount ON event_log(case_concept_name, totalPaymentAmount);",
    "CREATE INDEX IF NOT EXISTS idx_case_expense ON event_log(case_concept_name, expense);",
    "CREATE INDEX IF NOT EXISTS idx_case_amount ON event_log(case_concept_name, amount);",
    "CREATE INDEX IF NOT EXISTS idx_case_time ON event_log(case_concept_name, time_timestamp);",
    "CREATE INDEX IF NOT EXISTS idx_case_name ON event_log(case_concept_name, concept_name);",
    "CREATE INDEX IF NOT EXISTS idx_case_covering ON event_log(case_concept_name, amount, expense, totalPaymentAmount, time_timestamp);"
    

]
for index_query in indexes:
    cur.execute(index_query)
conn.commit()

cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
conn.commit()
cur.execute("""UPDATE event_log SET idx = (
    SELECT rowid FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
        FROM event_log
    ) temp WHERE temp.rowid = event_log.rowid
);""")
conn.commit()
cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
conn.commit()
cur.execute("VACUUM;")
cur.execute("ANALYZE;")
conn.commit()
cur.close()

# Initialize the Chroma retriever

col_desc = """- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.
- 'org_resource' (int): A numeric code indicating the employee who handled the case.
- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  
        'G': dismissed by the judge
        '#': dismissed by the prefecture
        NIL: not dismissed, i.e., to be paid.
        There are several other values used for which we do not know the semantics.
- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.
- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.
- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)
- 'article' (string): The number of the article of the Italian roadtraffic law that is violated by the offender (e.g., article 157 refers to stopping and parking vehicles).
- 'points' (float): Penalty points deducted from the driving license. In Italy, each driver starts with 20 points on their license and may loose points for each offence, based on the gravity.
- 'expense' (int): The additional amount due to be paid for postal expenses. There are no nan values in this column.
- 'notificationType' (string): A flag indicating to whom the fine refers. 'P': car owner, 'C': car driver.
- 'lastSent' (datetime): N/A
- 'paymentAmount' (int): The amount paid by the offender in one transaction. There are no nan values in this column.
- 'matricola' (string): N/A (Probably refers to the matriculation number of the car.)
- 'concept_name' (string): the activity/ event type name
        Activity Description, column: 'concept:name':
            > 'Create Fine': The initial creation of the fine in the information system. It initializes event log attributes amount, dismissal, points and totalPaymentAmount.
            > 'Send Fine': A notification about the fine is sent by post to the offender.
            > 'Insert Fine Notification': The notification is received by the offender.
            > 'Add penalty': An additional penalty is applied.
            > 'Payment': A payment made by the offender is registered.
            > 'Send for Credit Collection': Unpaid fines are sent for credit collection. A separate process is started by a collection agency to collect the money of the unpaid fines.
            > 'Insert Date Appeal to Prefecture': The offender appeals against the fine to the prefecture. A prefecture in Italy is an administrative body representing the national government in each province.
            > 'Send Appeal to Prefecture': The appeal is sent to the prefecture by the local police.
            > 'Receive Result Appeal from Prefecture': The local police receives the result of the appeal. If the prefecture dismisses the fine, the appeal is deemed accepted, and the obligation to pay the fine is cancelled. In this case, there is no need for the police to receive the result from the prefecture (Receive Result Appeal from Prefecture) and notify the offender (Notify Result Appeal to Offender).
            > 'Notify Result Appeal to Offender': The local police informs the offender of the appeal result. 
            > 'Appeal to Judge': The offender appeals against the fine to a judge.
        IMPORTANT: The last event in a case can be arbitrary. There is no guarantee that the last event is 'Send Fine' or 'Payment'. The last event can be any event in the log."""

rm = Chroma(sentence_transformer_ef = sentence_transformer_ef , documentation = col_desc) # for python, use standard cols


[]
["- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.", "- 'org_resource' (int): A numeric code indicating the employee who handled the case.", "- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  \n        'G': dismissed by the judge\n        '#': dismissed by the prefecture\n        NIL: not dismissed, i.e., to be paid.\n        There are several other values used for which we do not know the semantics.", "- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.", "- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.", "- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)", "- 'article' (string): 

In [62]:

col_instructions = pd.read_csv(COLUMN_INSTRUCTIONS)

dp_graph = DependencyGraph(rm.cols, col_instructions) # check what is in rm.cols (it appends the timestamp and case concept name columns)

['amount',
 'org_resource',
 'dismissal',
 'vehicleClass',
 'totalPaymentAmount',
 'lifecycle_transition',
 'article',
 'points',
 'expense',
 'notificationType',
 'lastSent',
 'paymentAmount',
 'matricola',
 'concept_name',
 'case_concept_name',
 'time_timestamp']

In [31]:
# create a conn pool
from queue import Queue
import threading


class SQLiteConnectionPool:
    def __init__(self, database, max_size=10):
        self.database = database
        self.pool = Queue(maxsize=max_size)
        for _ in range(max_size):
            self.pool.put(self.create_new_connection())

    def create_new_connection(self):
        return sqlite3.connect(self.database, check_same_thread=False)

    def get_connection(self):
        return self.pool.get()

    def release_connection(self, conn):
        self.pool.put(conn)

pool = SQLiteConnectionPool(SQLITE_DB_NAME, max_size=10)

In [32]:
def save_report_sql(output,scores,filename, program, e_metric):
    qs = []
    exs = []
    preds = []
    for i in range(len(output)):
        qs.append(output[i][0]["question"])
        exs.append(output[i][0]["example"])
        try:
            preds.append(output[i][1]["answer"])
        except:
            preds.append("No Answer Possible")

    qa, es, tb= program.get_history()
    re = e_metric.get_reasoning()

    df_merged = pd.DataFrame({"question": qs, "example": exs, "prediction": preds, "SCORE": scores})
    df_merged['reasoning'] = df_merged['question'].map(re)
    df_merged['queries'] = df_merged['question'].map(qa)
    df_merged['errors'] = df_merged['question'].map(es)
    df_merged['table'] = df_merged['question'].map(tb)
    df_merged.to_csv(f"{filename}.csv", index=False)
    return df_merged

In [39]:
# reading the gold answers
qa_sql = pd.read_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/q_a_final.csv")


dataset = []

for question, answer, _ in qa_sql.values:
    dataset.append(dspy.Example(question = question, example = answer).with_inputs("question"))

train_test_split = pd.read_csv('/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/sql_questions_to_splitt.csv')
trainset_sql = []
testset_sql = []

for i in range(len(dataset)):
    try:
        obj = train_test_split.loc[train_test_split['Question'] == dataset[i].question]["Split"][i]
    except:
        print("error at", i)

    if obj == "train":
        trainset_sql.append(dataset[i])
    elif obj == "test":
        testset_sql.append(dataset[i])

print("trainset", len(trainset_sql))
print("testset", len(testset_sql))
print("dataset", len(dataset))

trainset 78
testset 103
dataset 181


In [77]:
judge_lm = dspy.LM(model='openai/gpt-4o', temperature=1, max_tokens=3000, stop=None, cache=True, seed= 99)

judge_adjusted = LM_EVAL(judge_lm)
judge_adjusted.load(JUDGE_PATH)

In [41]:
lm = dspy.LM('openai/gpt-4o', temperature=0.3, max_tokens=4096, stop=None, cache=True, seed=512)
dspy.settings.configure(lm = lm)

In [78]:
lm = dspy.LM('openai/gpt-4o-mini', temperature=0.3, max_tokens=4096, stop=None, cache=True, seed=530)
dspy.settings.configure(lm = lm)

In [79]:
# initialize the isolated program

sql_isolated = PM_SQL_multi_sp(pool=pool, rm=rm).activate_assertions()
sql_isolated.load(PM_SQL_PATH)

In [80]:
evaluate = Evaluate(devset=testset_sql, metric=judge_adjusted, num_threads=7, display_progress=True, display_table=len(testset_sql), return_outputs=True, max_errors=10)

In [81]:
score, outputs, scores = evaluate(program = sql_isolated, return_all_scores= True)

Average Metric: 92.00 / 92 (100.0%):  89%|████████▉ | 92/103 [17:24<01:13,  6.69s/it]  

2024/12/28 18:45:27 INFO dspy.primitives.assertions: SuggestionFailed: Your reasoning should be fewer than 1400 characters long


Average Metric: 92.00 / 93 (98.9%):  90%|█████████ | 93/103 [17:44<01:44, 10.46s/it] 

2024/12/28 18:45:44 INFO dspy.primitives.assertions: SuggestionFailed: Your reasoning should be fewer than 1400 characters long


Average Metric: 96.00 / 96 (100.0%):  93%|█████████▎| 96/103 [18:16<01:11, 10.24s/it]

2024/12/28 18:46:17 INFO dspy.primitives.assertions: SuggestionFailed: Your reasoning should be fewer than 1400 characters long


Average Metric: 102.00 / 103 (99.0%): 100%|██████████| 103/103 [19:21<00:00, 11.28s/it]

2024/12/28 18:47:16 INFO dspy.evaluate.evaluate: Average Metric: 102 / 103 (99.0%)





Unnamed: 0,question,example,answer,LM_EVAL
0,How many events are in the log?,561470,561470,✔️ [2]
1,When is the start of the event log?,1/1/2000 / 2000-01-01,2000-01-01 00:00:00+00:00,✔️ [2]
2,When is the end of the event log?,6/18/2013/ 2013-06-18,2013-06-18 00:00:00+00:00,✔️ [2]
3,How many cases have sent an appeal to the Prefecture?,4141,4188,✔️ [1]
4,How many event types are there?,11,11,✔️ [2]
...,...,...,...,...
98,Which case is underpaid == True AND part_paid == True AND payment_...,C23364,case_concept_name,
99,How many cases have undresolved == True AND obligation_topay_cance...,13395,28984,
100,How many cases have unresolved == True AND obligation_topay_cancel...,2,0,
101,How many cases are credit collected before 2002-12-24?,9383,9383,✔️ [2]


In [82]:
df_merged = save_report_sql(outputs,scores, "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Results_Combined/testset_pm_isolated_c_mini", sql_isolated, judge_adjusted)

eval of isolated program

In [45]:
## Save resport for isolated

def save_report_isolated(output,scores,filename, program, e_metric):
    qs = []
    exs = []
    preds = []
    for i in range(len(output)):
        qs.append(output[i][0]["question"])
        exs.append(output[i][0]["example"])
        try:
            preds.append(output[i][1]["answer"])
        except:
            preds.append("No Answer Possible")

    qa, es, tb, rh = program.get_history()
    re = e_metric.get_reasoning()

    df_merged = pd.DataFrame({"question": qs, "example": exs, "prediction": preds, "SCORE": scores})
    df_merged['reasoning'] = df_merged['question'].map(re)
    df_merged['queries'] = df_merged['question'].map(qa)
    df_merged['errors'] = df_merged['question'].map(es)
    df_merged['table'] = df_merged['question'].map(tb)
    df_merged['reasoning_hist'] = df_merged['question'].map(rh)
    df_merged.to_csv(f"{filename}.csv", index=False)
    return df_merged

In [71]:
judge_lm = dspy.LM(model='openai/gpt-4o', temperature=1, max_tokens=3000, stop=None, cache=True, seed= 99)

judge_adjusted = LM_EVAL(judge_lm)
judge_adjusted.load(JUDGE_PATH)

In [72]:
lm = dspy.LM('openai/gpt-4o-mini', temperature=0.3, max_tokens=4096, stop=None, cache=True, seed=515)
dspy.settings.configure(lm = lm)

In [73]:
sql_isolated_definitions = PM_isolated(pool=pool, rm=rm, dp_graph=dp_graph).activate_assertions()


In [52]:
testset[0]

Example({'question': 'How many events are in the log?', 'example': '561470', 'req_cols': nan}) (input_keys={'question', 'req_cols'})

In [74]:
evaluate = Evaluate(devset=testset, metric=judge_adjusted, num_threads=7, display_progress=True, display_table=len(testset), return_outputs=True, max_errors=10)

In [75]:
score, outputs, scores = evaluate(program = sql_isolated_definitions, return_all_scores= True)

  0%|          | 0/103 [00:00<?, ?it/s]Query executed successfully
output [[('2000-01-01 00:00:00+00:00',)], (('start_of_event_log', None, None, None, None, None, None),)]
Average Metric: 2.00 / 1 (200.0%):   1%|          | 1/103 [00:04<08:23,  4.94s/it]Query executed successfully
output [[('2013-06-18 00:00:00+00:00',)], (('end_of_event_log', None, None, None, None, None, None),)]
Average Metric: 4.00 / 2 (200.0%):   2%|▏         | 2/103 [00:09<07:27,  4.43s/it]Query executed successfully
output [[(4141,)], (('total_cases_sent_appeal', None, None, None, None, None, None),)]
Average Metric: 6.00 / 3 (200.0%):   3%|▎         | 3/103 [00:21<13:23,  8.03s/it]Query executed successfully
output [[(561470,)], (('total_events', None, None, None, None, None, None),)]
Average Metric: 8.00 / 4 (200.0%):   4%|▍         | 4/103 [00:24<10:23,  6.30s/it]Query executed successfully
output [[(11,)], (('distinct_event_types', None, None, None, None, None, None),)]
Average Metric: 10.00 / 5 (200.0%):   

2024/12/28 18:09:25 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:09:30 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:09:36 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out
Average Metric: 94.00 / 58 (162.1%):  56%|█████▋    | 58/103 [13:00<24:47, 33.06s/it]Query executed successfully
output [[(0,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 96.00 / 59 (162.7%):  57%|█████▋    | 59/103 [13:07<18:22, 25.06s/it]Query executed successfully
output [[(60304,)], (('fully_paid_cases', None, None, None, None, None, None),)]
Average Metric: 98.00 / 60 (163.3%):  58%|█████▊    | 60/103 [13:33<18:12, 25.40s/it]Query executed successfully
output [[(40.10374409789187,)], (('percentage_fully_paid', None, None, None, None, None, None),)]
Average Metric: 100.00 / 61 (163.9%):  59%|█████▉    | 61/103 [14:05<19:15, 27.51s/it]Query executed successfully
output [[(3,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 102.00 / 62 (164.5%):  60%|██████    | 62/103 [14:38<19:45, 28.92s/it]

2024/12/28 18:11:47 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out


2024/12/28 18:11:50 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out


2024/12/28 18:11:50 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out
Average Metric: 102.00 / 63 (161.9%):  61%|██████    | 63/103 [15:14<20:49, 31.23s/it]Query executed successfully
output [[(28984,)], (('unresolved_case_count', None, None, None, None, None, None),)]
Average Metric: 104.00 / 64 (162.5%):  62%|██████▏   | 64/103 [15:46<20:20, 31.29s/it]

2024/12/28 18:12:53 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out


2024/12/28 18:12:58 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out


2024/12/28 18:12:58 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out
Average Metric: 104.00 / 65 (160.0%):  63%|██████▎   | 65/103 [16:22<20:51, 32.94s/it]Query executed successfully
output [[(66373,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 106.00 / 66 (160.6%):  64%|██████▍   | 66/103 [16:27<15:00, 24.35s/it]Query executed successfully
output [[(23728,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 108.00 / 67 (161.2%):  65%|██████▌   | 67/103 [16:39<12:21, 20.60s/it]Query executed successfully
output [[(76,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 110.00 / 68 (161.8%):  66%|██████▌   | 68/103 [16:58<11:45, 20.15s/it]Query executed successfully
output [[(9,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 112.00 / 69 (162.3%):  67%|██████▋   | 69/103 [17:04<09:06, 16.06s/it]Query executed successfully
output [[(80670,)], (('case_count', None, None, None, None, No

2024/12/28 18:20:47 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: totalPaymentAmount
query timed out
Query executed successfully
output [[(28598,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 154.00 / 92 (167.4%):  89%|████████▉ | 92/103 [24:12<04:24, 24.01s/it]

2024/12/28 18:21:18 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:21:26 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:21:32 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: totalPaymentAmount
query timed out
Average Metric: 154.00 / 93 (165.6%):  90%|█████████ | 93/103 [24:56<05:01, 30.13s/it]

2024/12/28 18:21:40 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MAX()
query timed out


2024/12/28 18:21:49 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: concept_name
query timed out


2024/12/28 18:21:58 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: concept_name
query timed out
Average Metric: 154.00 / 94 (163.8%):  91%|█████████▏| 94/103 [25:22<04:19, 28.80s/it]Query executed successfully
output [[(130,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 154.00 / 95 (162.1%):  92%|█████████▏| 95/103 [25:30<03:00, 22.52s/it]Query executed successfully
output [[(5,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 156.00 / 96 (162.5%):  93%|█████████▎| 96/103 [25:56<02:45, 23.63s/it]Query executed successfully
output [[(0,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 158.00 / 97 (162.9%):  94%|█████████▍| 97/103 [26:23<02:28, 24.67s/it]Query executed successfully
output [[(5,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 160.00 / 98 (163.3%):  95%|█████████▌| 98/103 [26:52<02:09, 25.92s/it]Query executed successfully
output [[('C23364',)], (('case_concept_name', None, None, None, None, None,

2024/12/28 18:24:48 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: unresolved
query timed out


2024/12/28 18:24:53 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: e2.time_timestamp
query timed out


2024/12/28 18:24:59 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: no such column: e2.time_timestamp
query timed out
Average Metric: 162.00 / 100 (162.0%):  97%|█████████▋| 100/103 [28:23<01:51, 37.19s/it]

2024/12/28 18:25:11 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:25:20 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out


2024/12/28 18:25:27 INFO dspy.primitives.assertions: SuggestionFailed: Error executing SQLite query Query execution exceeded the time limit. Use a query that executes faster.


Query execution failed: misuse of aggregate function MIN()
query timed out
Average Metric: 162.00 / 101 (160.4%):  98%|█████████▊| 101/103 [28:52<01:09, 34.56s/it]Query executed successfully
output [[(9383,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 164.00 / 102 (160.8%):  99%|█████████▉| 102/103 [28:59<00:26, 26.25s/it]Query executed successfully
output [[(9383,)], (('case_count', None, None, None, None, None, None),)]
Average Metric: 166.00 / 103 (161.2%): 100%|██████████| 103/103 [29:24<00:00, 17.13s/it]

2024/12/28 18:26:02 INFO dspy.evaluate.evaluate: Average Metric: 166 / 103 (161.2%)





Unnamed: 0,question,example,req_cols,answer,LM_EVAL
0,How many events are in the log?,561470,,561470,✔️ [2]
1,When is the start of the event log?,1/1/2000 / 2000-01-01,,2000-01-01 00:00:00+00:00,✔️ [2]
2,When is the end of the event log?,6/18/2013/ 2013-06-18,,2013-06-18 00:00:00+00:00,✔️ [2]
3,How many cases have sent an appeal to the Prefecture?,4141,,4141,✔️ [2]
4,How many event types are there?,11,,There are 11 distinct event types.,✔️ [2]
...,...,...,...,...,...
98,Which case is underpaid == True AND part_paid == True AND payment_...,C23364,"['underpaid', 'part_paid', 'payment_count', 'underpaid_amount', 'c...",C23364,✔️ [2]
99,How many cases have undresolved == True AND obligation_topay_cance...,13395,"['unresolved', 'obligation_topay_cancelled']","There was an error during the execution of the SQL query, which re...",
100,How many cases have unresolved == True AND obligation_topay_cancel...,2,"['unresolved', 'obligation_topay_cancelled', 'underpaid_amount', '...",There was an error during the answering of the question due to the...,
101,How many cases are credit collected before 2002-12-24?,9383,"['credit_collected', 'time_timestamp_end']",9383,✔️ [2]


In [76]:
df_merged = save_report_isolated(outputs,scores, "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Results_Combined/testset_pm_isolated_definitions_uc_mini", sql_isolated_definitions, judge_adjusted)

creating indices in the db to speed things up a little bit

In [None]:
creating