In [141]:
import sqlite3
import pandas as pd
import dspy
import dotenv, os
from pydantic import BaseModel, Field
import openai
import pm4py
import ast
from numpy import random
from dspy.evaluate import Evaluate
from collections import defaultdict
import tqdm as notebook_tqdm
import copy
import re
from dspy.teleprompt import BootstrapFewShot
import csv
import phoenix as px
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from dspy.primitives.assertions import assert_transform_module, backtrack_handler
from chroma_retriever import Chroma
from chromadb.utils import embedding_functions
import numpy as np
import functools
from PY_programs.python_tables import PM_PY_no_deep
from PY_programs.python_simple import PM_PY_simple

# 0.Data loading

In [2]:
# Notebook-wide configuration constants.
INPUT_FILE_NAME = "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dataset/Road_Traffic_Fine_Management_Process.xes" # replace with your own path to the XES event log
SQLITE_DB_NAME = "python_test.db" #"my_database.db" # leave as is
LLM_MODEL_TYPE = "gpt-4o"#"gpt-3.5-turbo-0125" # model name; earlier alternatives were "gpt-3.5-turbo-0125" and "gpt-4-turbo"
SQLITE_DB_EVAL_NAME = "python_eval.db" # SQLite file used for the evaluation database

In [3]:
# Embedding function based on the "all-mpnet-base-v2" sentence-transformer model;
# it is handed to the Chroma retriever when that is constructed.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [165]:
# Load variables from a local .env file (presumably the OpenAI API key -- TODO confirm).
dotenv.load_dotenv()
# Build the model id from LLM_MODEL_TYPE so the config cell is the single place to switch
# models (previously the id was hard-coded here and the constant was unused).
lm = dspy.LM(f'openai/{LLM_MODEL_TYPE}', temperature=0.3, max_tokens=4096, stop=None, cache=False)
dspy.settings.configure(lm = lm)

In [52]:
# Read the XES event log into a DataFrame and replace ':' in the column names with '_'.
df = pm4py.read_xes(INPUT_FILE_NAME)
df.columns = df.columns.str.replace(':', '_', regex=False)

# Monetary columns: missing values become 0, then euros are converted to integer cents.
# NOTE(review): astype(int) truncates, so a float product that lands just below a whole
# number is rounded down -- confirm this is intended before switching to round().
for money_col in ('amount', 'expense', 'paymentAmount', 'totalPaymentAmount'):
    df[money_col] = (df[money_col].fillna(0) * 100).astype(int)

conn = sqlite3.connect(SQLITE_DB_NAME)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS temp_table;")

# The DROP statement yields no result rows, so this prints an empty list.
print(cur.fetchall())
conn.commit()

# (Re)create the event_log table from the DataFrame; an existing table is replaced.
df.to_sql("event_log", conn, if_exists="replace", index=False)
cur = conn.cursor()

# NOTE(review): the subquery below selects rowid, so idx ends up equal to each row's own
# rowid; the ROW_NUMBER() value (seq_num) is computed but never used. If idx is meant to
# be the position in (case_concept_name, time_timestamp) order, the SELECT should return
# seq_num -- confirm before changing, since other code may rely on the current values.
idx_update_sql = """UPDATE event_log SET idx = (
    SELECT rowid FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
        FROM event_log
    ) temp WHERE temp.rowid = event_log.rowid
);"""

# Index the case id, add and fill the idx column, index it; commit after every statement.
for statement in (
    "CREATE INDEX idx_case_concept_name_event_log ON event_log(case_concept_name)",
    "ALTER TABLE event_log ADD COLUMN idx INTEGER;",
    idx_update_sql,
    "CREATE INDEX idx_event_log_idx ON event_log(idx);",
):
    cur.execute(statement)
    conn.commit()

cur.execute("VACUUM;")
cur.execute("ANALYZE;")
conn.commit()
cur.close()

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

[]


In [7]:
# Start the local Phoenix app and export DSPy traces to it over OTLP/HTTP.
# NOTE(review): the recorded output of this cell contains an "Address already in use"
# error, which suggests the cell was run while an earlier Phoenix instance was still up.
phoenix_session = px.launch_app()
endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
# SimpleSpanProcessor is attached so spans go to the exporter configured above.
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

# Register the provider globally, then instrument DSPy so its calls produce spans.
trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

E0000 00:00:1735216702.722517 13503914 chttp2_server.cc:1177] UNKNOWN:No address added out of total 1 resolved for '[::]:4317' {created_time:"2024-12-26T13:38:22.722512+01:00", children:[UNKNOWN:Failed to add any wildcard listeners {created_time:"2024-12-26T13:38:22.722499+01:00", children:[UNKNOWN:Unable to configure socket {created_time:"2024-12-26T13:38:22.722471+01:00", fd:95, children:[UNKNOWN:bind: Address already in use (48) {created_time:"2024-12-26T13:38:22.722446+01:00"}]}, UNKNOWN:Unable to configure socket {fd:95, created_time:"2024-12-26T13:38:22.722497+01:00", children:[UNKNOWN:bind: Address already in use (48) {created_time:"2024-12-26T13:38:22.72249+01:00"}]}]}]}
ERROR:    Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/starlette/routing.py", line 732, in lifespan
    async with self.lifespan_context(app) as maybe_state:
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [171]:
# Natural-language documentation of the event-log columns; this string is passed to the
# Chroma retriever as its `documentation` argument.
# NOTE(review): the activity section refers to the column as 'concept:name', while the
# loaded DataFrame renames ':' to '_' (i.e. 'concept_name') -- confirm whether the text
# should use the renamed column. Left unchanged because the string is runtime input.
col_desc = """- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.
- 'org_resource' (int): A numeric code indicating the employee who handled the case.
- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  
        'G': dismissed by the judge
        '#': dismissed by the prefecture
        NIL: not dismissed, i.e., to be paid.
        There are several other values used for which we do not know the semantics.
- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.
- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.
- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)
- 'article' (string): The number of the article of the Italian roadtraffic law that is violated by the offender (e.g., article 157 refers to stopping and parking vehicles).
- 'points' (float): Penalty points deducted from the driving license. In Italy, each driver starts with 20 points on their license and may loose points for each offence, based on the gravity.
- 'expense' (int): The additional amount due to be paid for postal expenses. There are no nan values in this column.
- 'notificationType' (string): A flag indicating to whom the fine refers. 'P': car owner, 'C': car driver.
- 'lastSent' (datetime): N/A
- 'paymentAmount' (int): The amount paid by the offender in one transaction. There are no nan values in this column.
- 'matricola' (string): N/A (Probably refers to the matriculation number of the car.)
- 'concept_name' (string): the activity/ event type name
        Activity Description, column: 'concept:name':
            > 'Create Fine': The initial creation of the fine in the information system. It initializes event log attributes amount, dismissal, points and totalPaymentAmount.
            > 'Send Fine': A notification about the fine is sent by post to the offender.
            > 'Insert Fine Notification': The notification is received by the offender.
            > 'Add penalty': An additional penalty is applied.
            > 'Payment': A payment made by the offender is registered.
            > 'Send for Credit Collection': Unpaid fines are sent for credit collection. A separate process is started by a collection agency to collect the money of the unpaid fines.
            > 'Insert Date Appeal to Prefecture': The offender appeals against the fine to the prefecture. A prefecture in Italy is an administrative body representing the national government in each province.
            > 'Send Appeal to Prefecture': The appeal is sent to the prefecture by the local police.
            > 'Receive Result Appeal from Prefecture': The local police receives the result of the appeal. If the prefecture dismisses the fine, the appeal is deemed accepted, and the obligation to pay the fine is cancelled. In this case, there is no need for the police to receive the result from the prefecture (Receive Result Appeal from Prefecture) and notify the offender (Notify Result Appeal to Offender).
            > 'Notify Result Appeal to Offender': The local police informs the offender of the appeal result. 
            > 'Appeal to Judge': The offender appeals against the fine to a judge.
        IMPORTANT: The last event in a case can be arbitrary. There is no guarantee that the last event is 'Send Fine' or 'Payment'. The last event can be any event in the log."""

In [172]:
# Initialize the Chroma retriever with the embedding function and the column documentation.
rm = Chroma(sentence_transformer_ef = sentence_transformer_ef , documentation = col_desc) # for Python programs, use the standard column descriptions

["- 'amount' (int): The amount due to be paid for the fine (including the penalty amount in case it is added). There are no nan values in this column.", "- 'org_resource' (int): A numeric code indicating the employee who handled the case.", "- 'dismissal' (string): A flag indicating whether and by whom the fine is dismissed. It is initialized to NIL. We know the meaning of:  \n        'G': dismissed by the judge\n        '#': dismissed by the prefecture\n        NIL: not dismissed, i.e., to be paid.\n        There are several other values used for which we do not know the semantics.", "- 'vehicleClass' (string): A flag indicating the kind of vehicle driven or owned by the offender. The semantics of the values is unknown.", "- 'totalPaymentAmount' (int): The cumulative amount paid by the offender. It is always initialized to 0. There are no nan values in this column.", "- 'lifecycle_transition' (string): the transition of the activity (complete, start, etc.)", "- 'article' (string): The

In [173]:
# Add the description of the derived column 'obligation_topay_cancelled' to the retriever
# (that column is copied from the gold table into the evaluation database further down).
rm.add_new("- 'obligation_topay_cancelled' (int): A boolean indicator (stored as an integer) showing whether the obligation to pay the fine is cancelled due to the time difference between 'Create Fine' and 'Send Fine' being more than 90 days. 1 for cancelled, 0 otherwise. This value is consistent across all rows pertaining to the same case.")

# Saving Class

In [10]:
def _report_try(getter, fallback):
    """Return ``getter()``, or ``fallback`` if it raises an ordinary exception."""
    try:
        return getter()
    except Exception:
        # Best-effort lookup: report the placeholder instead of aborting the report.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return fallback


def save_report_PM(outputs,scores,filename, program, conn = conn):
    """Assemble an evaluation report and write it to ``{filename}.csv``.

    Args:
        outputs: sequence of indexable items. ``item[0]`` must expose ``instruction``,
            ``column``, ``df_name`` and ``example``; ``item[1]`` may expose ``rationale``
            and ``description`` (placeholders are used when they cannot be read).
        scores: per-example metric values, stored in the ``PM_Metric`` column.
        filename: output path without the ``.csv`` suffix.
        program: object whose ``get_history()`` returns ``(code, errors, descriptions)``;
            ``code`` and ``errors`` are indexed by instruction.
        conn: DB-API connection containing the ``event_log`` table. The default is
            bound once, when this ``def`` is executed.

    Returns:
        pandas.DataFrame: one row per item in ``outputs``; the same frame that is
        written to disk.
    """
    examples = [item[0] for item in outputs]
    inst_lst = [ex.instruction for ex in examples]
    col_lst = [ex.column for ex in examples]
    dfn_lst = [ex.df_name for ex in examples]
    example_lst = [ex.example for ex in examples]
    rationale_lst = [_report_try(lambda: item[1].rationale, "No rationale") for item in outputs]
    description_lst = [_report_try(lambda: item[1].description, "No description") for item in outputs]

    # Up to three code attempts are reported per instruction.
    code, errors, _ = program.get_history()
    code_1, code_2, code_3 = (
        [_report_try(lambda: code[inst][attempt], "No code") for inst in inst_lst]
        for attempt in range(3)
    )
    error_lst = [errors[inst] for inst in inst_lst]

    # Load the current event_log table and normalise dtypes.
    du = pd.read_sql_query('SELECT * FROM event_log', conn, parse_dates = ['time_timestamp'])
    if "duration" in du.columns:
        du['duration'] = pd.to_timedelta(du['duration'], unit = "s")
    for cols in du.columns:
        # Columns holding only 0/1 are treated as booleans, except "*_count" columns.
        if du[cols].isin([0,1]).all() and not cols.endswith("_count"):
            du[cols] = du[cols].astype(bool)
    du_sorted = du.sort_values(by=['case_concept_name', 'time_timestamp'])

    # Predicted column per example; the string "error" marks a column that is missing.
    g_dfs = [_report_try(lambda: du_sorted[name], "error") for name in col_lst]

    # Number of rows that differ from the expected column ("error" if not comparable).
    lst_non_equal = [
        _report_try(lambda: (expected != predicted).sum(), "error")
        for expected, predicted in zip(example_lst, g_dfs)
    ]

    df_merged = pd.DataFrame({'Instruction': inst_lst, 'Column': col_lst, 'df_name': dfn_lst, 'Example': example_lst, 'Rationale': rationale_lst, 'Description': description_lst, "PM_Metric": scores, 'Code 1': code_1, 'Code 2': code_2, 'Code 3': code_3, "errors": error_lst, 'pred_col': g_dfs, "num_rows_different": lst_non_equal})
    df_merged.to_csv(f"{filename}.csv", index=False, quoting=csv.QUOTE_ALL, escapechar='\\')
    return df_merged

# Metric

In [174]:
def PM_Metric(example, prediction, trace = None, conn=conn):
    """Metric: does the column stored in ``event_log`` equal the expected column?

    The ``event_log`` table is read through ``conn``, sorted by
    ``case_concept_name`` and ``time_timestamp``, and the column named by
    ``example.column`` is compared with ``example.example`` using ``Series.equals``.

    Args:
        example: object exposing ``column`` (column name) and ``example`` (expected Series).
        prediction: not used by this function; part of the metric call signature.
        trace: not used by this function.
        conn: DB-API connection to read from. The default is bound once, when this
            ``def`` is executed, so re-run this cell after rebinding ``conn``.

    Returns:
        bool: True if the stored column equals the expected one; False otherwise,
        including when the table cannot be read or the column does not exist.
    """
    col = example.column
    gold_col = example.example

    try:
        tp = pd.read_sql_query('SELECT * FROM event_log', conn, parse_dates = ['time_timestamp'])
        if "duration" in tp.columns:
            tp['duration'] = pd.to_timedelta(tp['duration'], unit = "s")
    except Exception as e:
        print(f"Error executing SQL query: {str(e)}")
        return False

    for cols in tp.columns:
        # Columns holding only 0/1 are treated as booleans, except "*_count" columns.
        if tp[cols].isin([0,1]).all() and not cols.endswith("_count"):
            tp[cols] = tp[cols].astype(bool)

    tp_sorted = tp.sort_values(by=['case_concept_name', 'time_timestamp'])

    try:
        pred_col = tp_sorted[col]
    except KeyError:
        print(f"Eval could not retrieve col: {col}")
        return False

    # Series.equals is already False when the lengths differ, so no separate
    # length check is needed (the old one could never fail).
    if pred_col.equals(gold_col):
        print("EVAL TRUE")
        return True

    print("EVAL FALSE")
    print("Type of pred_col", pred_col.dtype)
    print("Type of gold_col", gold_col.dtype)
    # Diagnostics only: the elementwise comparison can raise (e.g. differently
    # labelled Series), which must not turn into a metric crash.
    try:
        non_equal_elements = pred_col != gold_col
        print("Number of rows different", non_equal_elements.sum())
    except Exception as e:
        print(f"Error in comparison: {str(e)}")
    return False

In [12]:
def _rebuild_event_log_table(conn, df):
    """Replace the ``event_log`` table with ``df`` and recreate its ``idx`` column and indexes."""
    df.to_sql("event_log", conn, if_exists="replace", index=False)
    cur = conn.cursor()
    try:
        cur.execute("CREATE INDEX idx_case_concept_name_event_log ON event_log(case_concept_name)")
        conn.commit()

        cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
        conn.commit()
        # NOTE(review): the subquery selects rowid, so idx equals each row's own rowid and
        # the ROW_NUMBER() value (seq_num) is unused -- confirm whether seq_num was intended.
        cur.execute("""UPDATE event_log SET idx = (
        SELECT rowid FROM (
            SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
            FROM event_log
        ) temp WHERE temp.rowid = event_log.rowid
    );""")
        conn.commit()
        cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
        conn.commit()
        cur.execute("VACUUM;")
        cur.execute("ANALYZE;")
        conn.commit()
    finally:
        cur.close()


def PM_Metric_training(example, prediction, trace = None, df=df):
    """Training metric: compare the stored column with the expected one, then reset the DB.

    Reads ``event_log`` from the SQLite file named by ``SQLITE_DB_NAME``, then rebuilds
    that table from ``df``, and finally compares the column ``example.column`` (as read
    before the reset) with ``example.example`` using ``Series.equals``.

    Args:
        example: object exposing ``column`` (column name) and ``example`` (expected Series).
        prediction: not used by this function; part of the metric call signature.
        trace: not used by this function.
        df: DataFrame the ``event_log`` table is rebuilt from. The default is bound
            once, when this ``def`` is executed.

    Returns:
        bool: True if the stored column equals the expected one; False otherwise,
        including when the table cannot be read or the column does not exist.
    """
    col = example.column
    gold_col = example.example

    # Use the configured database name rather than a second hard-coded copy of it.
    conn = sqlite3.connect(SQLITE_DB_NAME)
    try:
        try:
            dp = pd.read_sql_query('SELECT * FROM event_log', conn, parse_dates = ['time_timestamp'])
            if "duration" in dp.columns:
                dp['duration'] = pd.to_timedelta(dp['duration'], unit = "s")
        except Exception as e:
            print(f"Error executing SQL query: {str(e)}")
            return False

        for cols in dp.columns:
            # Columns holding only 0/1 are treated as booleans, except "*_count" columns.
            if dp[cols].isin([0,1]).all() and not cols.endswith("_count"):
                dp[cols] = dp[cols].astype(bool)

        dp_sorted = dp.sort_values(by=['case_concept_name', 'time_timestamp'])

        # Reset the table to the base event log so the next example starts clean.
        _rebuild_event_log_table(conn, df)
    finally:
        # The connection was previously never closed, leaking one handle per call.
        conn.close()

    try:
        pred_col = dp_sorted[col]
    except KeyError:
        print(f"Eval could not retrieve col: {col}")
        return False

    if pred_col.equals(gold_col):
        print("EVAL TRUE")
        return True

    print("EVAL FALSE")
    # Report dtypes instead of type(series[0]): the label lookup [0] can fail when
    # label 0 is absent (e.g. an empty table).
    print("Type of pred_col", pred_col.dtype)
    print("Type of gold_col", gold_col.dtype)
    # Diagnostics only: the elementwise comparison can raise (e.g. differently
    # labelled Series), which must not turn into a metric crash.
    try:
        non_equal_elements = pred_col != gold_col
        print("Number of rows different", non_equal_elements.sum())
    except Exception as e:
        print(f"Error in comparison: {str(e)}")
    return False

# Loading Dataset

In [13]:
# Load the gold table and parse its timestamp and duration (seconds) columns.
gold = pd.read_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/PM_EVALQUESTIONS_DF4.csv").assign(
    time_timestamp=lambda frame: pd.to_datetime(frame['time_timestamp']),
    duration=lambda frame: pd.to_timedelta(frame['duration'], unit= "s"),
)
# Order by case and time, with a fresh 0..n-1 index.
gold_sorted = (
    gold
    .sort_values(by=['case_concept_name', 'time_timestamp'])
    .reset_index(drop=True)
)

In [14]:
# Per case, count how often each value of "paid_without_obligation" occurs.
gold_sorted.groupby("case_concept_name")["paid_without_obligation"].value_counts()

case_concept_name  paid_without_obligation
A1                 False                      2
A100               False                      5
A10000             True                       5
A10001             False                      6
A10004             False                      5
                                             ..
V9995              False                      5
V9996              False                      3
V9997              False                      5
V9998              False                      5
V9999              False                      5
Name: count, Length: 150370, dtype: int64

In [15]:
# Preview the first rows of the sorted gold table.
gold_sorted.head()

Unnamed: 0,amount,org_resource,dismissal,concept_name,vehicleClass,totalPaymentAmount,lifecycle_transition,time_timestamp,article,points,...,underpaid,credit_collected_AND_fully_paid,dismissed_AND_fully_paid,overpaid_amount,underpaid_amount,part_paid,unresolved,paid_without_obligation,time_timestamp_beginn,time_timestamp_end
0,3500,561.0,NIL,Create Fine,A,0,complete,2006-07-24 00:00:00+00:00,157.0,0.0,...,True,False,False,0,4600,False,True,False,2006-07-24 00:00:00+00:00,2006-12-05 00:00:00+00:00
1,0,,,Send Fine,,0,complete,2006-12-05 00:00:00+00:00,,,...,True,False,False,0,4600,False,True,False,2006-07-24 00:00:00+00:00,2006-12-05 00:00:00+00:00
2,3500,561.0,NIL,Create Fine,A,0,complete,2006-08-02 00:00:00+00:00,157.0,0.0,...,True,False,False,0,8250,False,False,False,2006-08-02 00:00:00+00:00,2009-03-30 00:00:00+00:00
3,0,,,Send Fine,,0,complete,2006-12-12 00:00:00+00:00,,,...,True,False,False,0,8250,False,False,False,2006-08-02 00:00:00+00:00,2009-03-30 00:00:00+00:00
4,0,,,Insert Fine Notification,,0,complete,2007-01-15 00:00:00+00:00,,,...,True,False,False,0,8250,False,False,False,2006-08-02 00:00:00+00:00,2009-03-30 00:00:00+00:00


In [16]:
# Load the benchmark questions and preview them; the displayed columns are
# Definition, Column, "Dependent On New Col" and DF.
qa = pd.read_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/PM_EVALQUESTIONS_final.csv")
qa.head()

Unnamed: 0,Definition,Column,Dependent On New Col,DF
0,"Create column called ""dismissed_by_prefecture""...",dismissed_by_prefecture,,1
1,"Create column called ""dismissed_by_judge"", def...",dismissed_by_judge,,1
2,"Create a column called ""maxtotalPaymentAmount""...",maxtotalPaymentAmount,,1
3,"Create a column called ""duration"" which takes ...",duration,,1
4,"Create a column called ""event_count"", type int...",event_count,,1


In [17]:
# Print the row label of every question whose target column is the
# cancelled-obligation flag, accepting either spelling of the name.
for i in range(len(qa)):
    if qa["Column"][i] in ("obligation_topay_cancelled", "obligation_to_pay_cancelled"):
        print(i)

10


In [18]:
# Check the exact spelling of the column name stored in row 10.
qa["Column"][10]

'obligation_topay_cancelled'

In [19]:
# One dspy.Example per benchmark question: the instruction is the only input field,
# and the expected column is taken from the sorted gold table.
dataset = [
    dspy.Example(instruction = instruction, column = col, df_name = df_name, example = gold_sorted[col]).with_inputs("instruction")
    for instruction, col, _dep, df_name in qa.values
]


In [20]:
# Questions whose target column is listed in cols_in_trainset form the training set;
# all remaining questions form the test set.
cols_in_trainset = ['duration', 'event_count', 'amount_min', 'obligation_topay_cancelled', 'dismissed_by_other', 'appeal_to_judgeorprefecture', 'add_penalty_count', 'send_fine_count', 'payment_count', 'insert_fine_notification_count', 'send_for_credit_collection_count', 'insert_date_appeal_to_prefecture_count', 'send_appeal_to_prefecture_count', 'receive_result_appeal_from_prefecture_count', 'notify_result_appeal_to_offender_count', 'time_timestamp_beginn', 'time_timestamp_end', 'appeal_to_judge_count']

trainset = [ex for ex in dataset if ex["column"] in cols_in_trainset]

print(len(trainset))

# Swap the first and the fourth training example.
# NOTE(review): the reason for this reordering is not recorded in the notebook.
trainset[0], trainset[3] = trainset[3], trainset[0]

# Select the test set by column name instead of `example not in trainset`: the latter
# compares whole examples (including their pandas Series payload), which is quadratic
# and breaks as soon as two examples agree on every field before the Series.
testset = [ex for ex in dataset if ex["column"] not in cols_in_trainset]

print(len(testset))


18
25


# Evaluation

**'openai/gpt-4o', temperature=0.3, max_tokens=4096,**

- PM_PY_no_deep (with assertions): 68%, 13m NAME: PY_testset_uncompiled
- PM_PY_no_deep_fewshot_12 (with assertions): 100%, 10m 51s NAME: PY_testset_add_fewshot12_final
- PM_PY_no_deep_fewshot_12 (no assertions): 92%, 9m 50s NAME: PY_testset_add_fewshot12_no_assertion
- PM_PY_simple (with assertions): 28.0%, 29m 30s, NAME: PY_testset_simple (9/25 cols correct, descriptions wrong)
- PM_PY_simple (no assertions): 0%, 8m 30s, NAME: PY_testset_simple_no_assertion (6/25 cols correct, descriptions wrong)


**'openai/gpt-4o-mini', temperature=0.3, max_tokens=4096,**

- PM_PY_no_deep (with assertions): 36.0%, 22m 54s Name: PM_PY_testset_mini_uncompiled
- PM_PY_no_deep (no assertions): 0%, 5m 7s (the score is 0% because the column descriptions could not be fed into the chroma retriever; in terms of columns, 9/25 were correct), Name: PM_PY_testset_mini_uncompiled_no_assertion
- PM_PY_no_deep_fewshot_12 (with assertions): 92.0% 14m 29s, Name : PY_testset_mini_fewshot12
- PM_PY_no_deep_fewshot_12 (no assertions): 92.0% 10m 20s, Name: PY_testset_mini_fewshot12_no_assertion (score is super high, maybe run again)*
- PM_PY_simple (with assertions): 24.0%, 37m Name: PY_testset_mini_simple (a few descriptions were incorrect (10/25 cols ))
- PM_PY_simple (no assertions): 0.0%, 8m 52s, Name: PY_testset_mini_simple_no_assertion (9/25 col were correct, descriptions wrong)

For the evaluation on the trainset, we must make sure that any previously created columns a question depends on are present. Therefore, we create two databases: one that contains only the vanilla event log, and another that also includes the columns from the trainset.

In [128]:
# PM_PY_no_deep on the evaluation database, wrapped so that DSPy assertions/suggestions
# are handled with backtracking (at most 4 backtracks).
uncompiled_py = assert_transform_module(PM_PY_no_deep(conn_path= SQLITE_DB_EVAL_NAME, rm=rm, training_mode=False ), functools.partial(backtrack_handler, max_backtracks=4))
# Optional (disabled): load a saved program state from disk and mark the program as compiled.
#uncompiled_py.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_add_fewshot_12.json")
#uncompiled_py._compiled = True

In [137]:
# Same program as above but without the assertion wrapper.
uncompiled_py_no_assert = PM_PY_no_deep(conn_path= SQLITE_DB_EVAL_NAME, rm=rm, training_mode=False )
# Optional (disabled): load a saved program state from disk and mark the program as compiled.
# The variable name is corrected here; these lines previously referred to an undefined
# `compiled_no_assert`.
#uncompiled_py_no_assert.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_add_fewshot_12.json")
#uncompiled_py_no_assert._compiled = True

In [166]:
# PM_PY_simple on the evaluation database, wrapped so that DSPy assertions/suggestions
# are handled with backtracking (at most 4 backtracks).
un_py_simple = assert_transform_module(PM_PY_simple(conn_path= SQLITE_DB_EVAL_NAME, rm=rm), functools.partial(backtrack_handler, max_backtracks=4))

In [175]:
# PM_PY_simple on the evaluation database without the assertion wrapper.
un_py_simple_no_assert = PM_PY_simple(conn_path= SQLITE_DB_EVAL_NAME, rm=rm)

In [160]:
# Build the evaluation database: the event log plus the 'obligation_topay_cancelled' column.
df_t = pm4py.read_xes(INPUT_FILE_NAME)
df_t.columns = df_t.columns.str.replace(':', '_', regex=False)

# Monetary columns: missing values become 0, then euros are converted to integer cents.
# NOTE(review): astype(int) truncates, so a float product that lands just below a whole
# number is rounded down -- confirm this is intended before switching to round().
for money_col in ('amount', 'expense', 'paymentAmount', 'totalPaymentAmount'):
    df_t[money_col] = (df_t[money_col].fillna(0) * 100).astype(int)

# NOTE(review): this assignment aligns on the index, so it presumably relies on df_t
# having the same row order and index as gold_sorted -- verify.
df_t['obligation_topay_cancelled'] = gold_sorted['obligation_topay_cancelled']

# This rebinds the notebook-level name `conn` to the evaluation database.
conn = sqlite3.connect(SQLITE_DB_EVAL_NAME)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS temp_table;")

# The DROP statement yields no result rows, so this prints an empty list.
print(cur.fetchall())
conn.commit()

# (Re)create the event_log table from the DataFrame; an existing table is replaced.
df_t.to_sql("event_log", conn, if_exists="replace", index=False)
cur = conn.cursor()

# NOTE(review): the subquery below selects rowid, so idx ends up equal to each row's own
# rowid; the ROW_NUMBER() value (seq_num) is computed but never used. If idx is meant to
# be the position in (case_concept_name, time_timestamp) order, the SELECT should return
# seq_num -- confirm before changing, since other code may rely on the current values.
idx_update_sql = """UPDATE event_log SET idx = (
    SELECT rowid FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) as seq_num, rowid
        FROM event_log
    ) temp WHERE temp.rowid = event_log.rowid
);"""

# Index the case id, add and fill the idx column, index it; commit after every statement.
for statement in (
    "CREATE INDEX idx_case_concept_name_event_log ON event_log(case_concept_name)",
    "ALTER TABLE event_log ADD COLUMN idx INTEGER;",
    idx_update_sql,
    "CREATE INDEX idx_event_log_idx ON event_log(idx);",
):
    cur.execute(statement)
    conn.commit()

cur.execute("VACUUM;")
cur.execute("ANALYZE;")
conn.commit()
cur.close()

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

[]


In [37]:
# Sanity check: number of examples in the test set.
len(testset)

25

In [57]:
# Inspect the test examples from position 24 onwards (here: the last one).
testset[24:]

[Example({'instruction': 'Create a column called "paid_without_obligation" which is a boolean on a per case basis. TRUE if the column "obligation_topay_cancelled" == TRUE AND the column "fully_paid" == TRUE. If else "paid_without_obligation" == FALSE. Makse sure all values for "paid_without_obligation" are the same across all rows of a case.', 'column': 'paid_without_obligation', 'df_name': 4, 'example': 0         False
 1         False
 2         False
 3         False
 4         False
           ...  
 561465    False
 561466    False
 561467    False
 561468    False
 561469    False
 Name: paid_without_obligation, Length: 561470, dtype: bool}) (input_keys={'instruction'})]

In [176]:
# Evaluator over the test set, scored with PM_Metric on a single thread.
# NOTE(review): num_threads=1 is presumably required because every example reads and
# writes the same SQLite database -- confirm before raising it.
evaluate = Evaluate(devset=testset, metric= PM_Metric, num_threads = 1, display_progress= True, display_table = len(testset), return_outputs=True, provide_traceback=True, max_errors=30)

In [177]:
# Run the evaluation for PM_PY_simple without assertions; the call is unpacked into the
# overall score, the per-example outputs and the per-example scores.
score, outputs, scores = evaluate(program = un_py_simple_no_assert, return_all_scores= True)

  0%|          | 0/25 [00:00<?, ?it/s]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:04:34 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:04:34 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create column called "dismissed_by_prefecture", defined as a boolean. True if any of the values of a case in column dismissal = "#" , if not False. Apply the same value across all rows of each case', 'column': 'dismissed_by_prefecture', 'df_name': 1, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: dismissed_by_prefecture, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47

Average Metric: 0.00 / 25 (0.0%):   4%|▍         | 1/25 [00:23<09:31, 23.82s/it]

2024/12/27 13:04:39 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['dismissed_by_judge'] = dp.groupby('case_concept_name')['dismissal'].transform(lambda x: 'G' in x.values)
2024/12/27 13:04:39 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create column called "dismissed_by_judge", defined as a boolean. True if any of the values of a case in column dismissal = "G" , if not False. Apply the same value across all rows of each case', 'column': 'dismissed_by_judge', 'df_name': 1, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: dismissed_by_judge, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['dismissed_by_judge'] = dp.groupby('case_concept_name')['dismissal'].tran

Average Metric: 0.00 / 25 (0.0%):   8%|▊         | 2/25 [00:28<04:52, 12.72s/it]

2024/12/27 13:04:40 INFO dspy.primitives.assertions: SuggestionFailed: Column type must be either INTEGER, BOOLEAN or DATETIME
2024/12/27 13:04:40 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "maxtotalPaymentAmount", type int, which for every case is the highest "totalPaymentAmount" value, if all the values of "totalPaymentAmount" are Nan or Null, simply use 0 instead. Make sure its the same value applied to each row of a case', 'column': 'maxtotalPaymentAmount', 'df_name': 1, 'example': 0         0
1         0
2         0
3         0
4         0
         ..
561465    0
561466    0
561467    0
561468    0
561469    0
Name: maxtotalPaymentAmount, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Column type must be either INTEGER, BOOLEAN or DATETIME
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", li

Average Metric: 0.00 / 25 (0.0%):  12%|█▏        | 3/25 [00:29<02:43,  7.43s/it]

2024/12/27 13:04:41 INFO dspy.primitives.assertions: SuggestionFailed: Column type must be either INTEGER, BOOLEAN or DATETIME
2024/12/27 13:04:41 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "expense_sum", defined as type int, which sums all "expense" values per case, make sure this exepense_sum is the same for every row in a case', 'column': 'expense_sum', 'df_name': 1, 'example': 0         1100
1         1100
2         1100
3         1100
4         1100
          ... 
561465    1516
561466    1516
561467    1516
561468    1516
561469    1516
Name: expense_sum, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Column type must be either INTEGER, BOOLEAN or DATETIME
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wrapped
    return function(item)
           ^^^^^^^^^^^^^^
  File "/Users

Average Metric: 0.00 / 25 (0.0%):  16%|█▌        | 4/25 [00:30<01:42,  4.86s/it]

2024/12/27 13:04:42 INFO dspy.primitives.assertions: SuggestionFailed: Column type must be either INTEGER, BOOLEAN or DATETIME
2024/12/27 13:04:42 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "amount_last", defined as type int, which takes the highest "amount" value per case, make sure "amount_last" is the same for every row in a case', 'column': 'amount_last', 'df_name': 1, 'example': 0          3500
1          3500
2          7150
3          7150
4          7150
          ...  
561465    26200
561466    26200
561467    26200
561468    26200
561469    26200
Name: amount_last, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Column type must be either INTEGER, BOOLEAN or DATETIME
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wrapped
    return function(item)
           ^^^^^^^^^^^^^^


Average Metric: 0.00 / 25 (0.0%):  20%|██        | 5/25 [00:31<01:09,  3.49s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:05:06 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:05:06 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "dismissed" which is a boolean, TRUE if any of the values in a case of "dismissal" contain either  "G" or "#", if not False. Apply the same value across all rows of each case', 'column': 'dismissed', 'df_name': 1, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: dismissed, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wrapped
    return func

Average Metric: 0.00 / 25 (0.0%):  24%|██▍       | 6/25 [00:55<03:18, 10.46s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:05:38 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:05:38 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "credit_collected" which is a boolean. True if any of the values in a case in the column "concept_name" contain "Send for Credit Collection", if not False. Apply the same value across all rows of each case.', 'column': 'credit_collected', 'df_name': 1, 'example': 0         False
1         False
2          True
3          True
4          True
          ...  
561465     True
561466     True
561467     True
561468     True
561469     True
Name: credit_collected, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/paralle

Average Metric: 0.00 / 25 (0.0%):  28%|██▊       | 7/25 [01:27<05:10, 17.25s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:06:13 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:06:13 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "penalty_added" which is a boolean, TRUE if any of the values in a case in the column "concept_name"contain "Add penalty", if not False. Apply the same value across all rows of each case.', 'column': 'penalty_added', 'df_name': 1, 'example': 0         False
1         False
2          True
3          True
4          True
          ...  
561465     True
561466     True
561467     True
561468     True
561469     True
Name: penalty_added, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wr

Average Metric: 0.00 / 25 (0.0%):  32%|███▏      | 8/25 [02:02<06:33, 23.14s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:06:35 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:06:35 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "appealed_to_judge" which is a boolean. TRUE if any of the values in a case in the column "concept_name" of a case contains "Appeal to Judge". False if else. Make sure "appealed_to_judge" is the same value across all rows of a case.', 'column': 'appealed_to_judge', 'df_name': 1, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: appealed_to_judge, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site

Average Metric: 0.00 / 25 (0.0%):  36%|███▌      | 9/25 [02:24<06:03, 22.74s/it]

2024/12/27 13:06:40 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['appealed_to_prefecture'] = dp.groupby('case_concept_name')['concept_name'].transform(lambda x: 'Insert Date Appeal to Prefecture' in x.values).astype(bool)
2024/12/27 13:06:40 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "appealed_to_prefecture" which is a boolean. TRUE if any of the values in a case in the column "concept_name" of a case contains "Insert Date Appeal to Prefecture". False if else. Make sure "appealed_to_judge" is the same value across all rows of a case.', 'column': 'appealed_to_prefecture', 'df_name': 1, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: appealed_to_prefecture, Length: 561470, dtype: bool}) (input_keys={'instruction'})

Average Metric: 0.00 / 25 (0.0%):  40%|████      | 10/25 [02:29<04:16, 17.12s/it]

2024/12/27 13:06:41 INFO dspy.primitives.assertions: SuggestionFailed: Column type must be either INTEGER, BOOLEAN or DATETIME
2024/12/27 13:06:41 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "outstanding_balance", as type int, which is defined as "amount_last" + "expense_sum" - "maxtotalPaymentAmount" for only the last row/event per case (i.e only consider the last row for each case during the calculation), make sure "outstanding_balance" is the same for every row in a case.', 'column': 'outstanding_balance', 'df_name': 2, 'example': 0          4600
1          4600
2          8250
3          8250
4          8250
          ...  
561465    27716
561466    27716
561467    27716
561468    27716
561469    27716
Name: outstanding_balance, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Column type must be either INTEGER, BOOLEAN or DATETIME
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Ba

Average Metric: 0.00 / 25 (0.0%):  44%|████▍     | 11/25 [02:30<02:51, 12.28s/it]

2024/12/27 13:06:56 INFO dspy.primitives.assertions: SuggestionFailed: Error executing code'<' not supported between instances of 'int' and 'str'
Stack trace:
Line 18: ).reindex(dp.index, method='ffill')
2024/12/27 13:06:56 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "credit_collected_AND_dismissed" which is a boolean. Defined as TRUE if per case any of the rows are TRUE in the column "dismissed" and TRUE for the column "credit_collected". If Else False. Make sure the value for "credit_collected_AND_dismissed" is the same across all rows of a case. ', 'column': 'credit_collected_AND_dismissed', 'df_name': 2, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: credit_collected_AND_dismissed, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Error executing code'<' not supported betw

Average Metric: 0.00 / 25 (0.0%):  48%|████▊     | 12/25 [02:45<02:48, 12.99s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:07:18 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:07:18 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "paid_nothing" which is a boolean. Defined as TRUE if and only if per case the column "maxotalPaymentAmount" is smaller equal to 0. Make sure the value for "paid_nothing" is the same across all rows of a case.', 'column': 'paid_nothing', 'df_name': 2, 'example': 0         True
1         True
2         True
3         True
4         True
          ... 
561465    True
561466    True
561467    True
561468    True
561469    True
Name: paid_nothing, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 

Average Metric: 0.00 / 25 (0.0%):  52%|█████▏    | 13/25 [03:07<03:09, 15.81s/it]

2024/12/27 13:07:25 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['appealed_to_judge'] = dp.groupby('case_concept_name')['concept_name'].transform(lambda x: 'Appeal to Judge' in x.values)
2024/12/27 13:07:25 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "appeal_judge_cancelled" which is a boolean on a per case basis. TRUE if the column "appealed_to_judge" == TRUE AND the column "dismissed_by_judge" == FALSE, if else "appeal_judge_cancelled" == FALSE. Make sure all values of "appeal_judge_cancelled" are the same across all rows of a case.', 'column': 'appeal_judge_cancelled', 'df_name': 2, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: appeal_judge_cancelled, Length: 561470, dtype: bool}) (input_keys={'instruction'})

Average Metric: 0.00 / 25 (0.0%):  56%|█████▌    | 14/25 [03:14<02:23, 13.06s/it]

2024/12/27 13:07:29 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['appealed_to_prefecture'] = dp.groupby('case_concept_name')['concept_name'].transform(lambda x: 'Send Appeal to Prefecture' in x.values)
2024/12/27 13:07:29 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "appeal_prefecture_cancelled" which is a boolean on a per case basis. TRUE if the column "appealed_to_prefecture" == TRUE AND the column "dismissed_by_prefecture" == FALSE, if else "appeal_prefecture_cancelled" == FALSE. Make sure all values of "appeal_prefecture_cancelled" are the same across all rows of a case.', 'column': 'appeal_prefecture_cancelled', 'df_name': 2, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: appeal_prefecture_cancelled, Length: 

Average Metric: 0.00 / 25 (0.0%):  60%|██████    | 15/25 [03:18<01:44, 10.47s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:07:53 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:07:53 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "fully_paid" which is a boolean, TRUE if "outstanding_balance" <= 0, if else FALSE, for every case in the event log, make sure "fully_paid" is the same for every row in a case', 'column': 'fully_paid', 'df_name': 3, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: fully_paid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wrapped
    return f

Average Metric: 0.00 / 25 (0.0%):  64%|██████▍   | 16/25 [03:42<02:11, 14.56s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:08:27 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:08:27 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "overpaid" which is a boolean. TRUE if on a per case basis, the column "outstanding_balance" is smaller than 0. False if else. Make sure the value for "overpaid" is the same across all rows of a case.', 'column': 'overpaid', 'df_name': 3, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: overpaid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in

Average Metric: 0.00 / 25 (0.0%):  68%|██████▊   | 17/25 [04:16<02:42, 20.25s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:09:17 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:09:17 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "underpaid" which is a boolean. TRUE if on a per case basis, the column "outstanding_balance" is larger than 0. False if else. Make sure the value for "underpaid" is the same across all rows of a case.', 'column': 'underpaid', 'df_name': 3, 'example': 0         True
1         True
2         True
3         True
4         True
          ... 
561465    True
561466    True
561467    True
561468    True
561469    True
Name: underpaid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v2/lib/python3.12/site-packages/dspy/utils/parallelizer.py", line 47, in wrapped

Average Metric: 0.00 / 25 (0.0%):  72%|███████▏  | 18/25 [05:06<03:24, 29.21s/it]

2024/12/27 13:09:38 INFO dspy.primitives.assertions: SuggestionFailed: Error executing code'<' not supported between instances of 'int' and 'str'
Stack trace:
Line 18: ).reindex(dp.index, method='ffill')
2024/12/27 13:09:38 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "credit_collected_AND_fully_paid" which is a boolean. Defined as TRUE if per case any of the rows are TRUE in the column "fully_paid" and TRUE for the column "credit_collected". If Else False. Make sure the value for "credit_collected_AND_fully_paid" is the same across all rows of a case. ', 'column': 'credit_collected_AND_fully_paid', 'df_name': 4, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: credit_collected_AND_fully_paid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Error executing code'<' not supported

Average Metric: 0.00 / 25 (0.0%):  76%|███████▌  | 19/25 [05:27<02:41, 26.92s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:10:25 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:10:25 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "dismissed_AND_fully_paid" which is a boolean. Defined as TRUE if per case any of the rows are TRUE in the column "dismissed" and TRUE for the column "fully_paid". If Else False. Make sure the value for "dismissed_AND_fully_paid" is the same across all rows of a case. ', 'column': 'dismissed_AND_fully_paid', 'df_name': 4, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: dismissed_AND_fully_paid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/

Average Metric: 0.00 / 25 (0.0%):  80%|████████  | 20/25 [06:14<02:44, 32.99s/it]

2024/12/27 13:10:33 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['overpaid'] = (dp['totalPaymentAmount'] > (dp['amount'] + dp['expense'])).astype(bool)
2024/12/27 13:10:33 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "overpaid_amount" which is an integer. Defined as 0 if the column "overpaid" == FALSE. If "overpaid" == TURE it is the difference between 0 and the column "outstanding_balance" (take the absolute value to avoid negative values. Make sure the value for "overpaid_amount" is the same across all rows of a case.', 'column': 'overpaid_amount', 'df_name': 4, 'example': 0         0
1         0
2         0
3         0
4         0
         ..
561465    0
561466    0
561467    0
561468    0
561469    0
Name: overpaid_amount, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Error executing codename 'dp' is not defined
Stack trace:
Line 3: dp['o

Average Metric: 0.00 / 25 (0.0%):  84%|████████▍ | 21/25 [06:22<01:41, 25.47s/it]length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:11:02 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:11:02 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "underpaid_amount" which is a integer. Defined as 0 if the column "underpaid" == FALSE. If "underpaid" == TURE it is equal to the value of the column "outstanding_balance". Make sure the value for "underpaid_amount" is the same across all rows of a case.', 'column': 'underpaid_amount', 'df_name': 4, 'example': 0          4600
1          4600
2          8250
3          8250
4          8250
          ...  
561465    27716
561466    27716
561467    27716
561468    27716
561469    27716
Name: underpaid_amount, Length: 561470, dtype: int64}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_v

Average Metric: 0.00 / 25 (0.0%):  88%|████████▊ | 22/25 [06:51<01:19, 26.47s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:11:29 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:11:29 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "part_paid" which is a boolean. TRUE if on a per case basis, the column "fully_paid" AND "paid_nothing"are equal to FALSE. If Either "fully_paid" OR "paid_nothing" == TRUE, "part_paid" == FALSE. Make sure the value for "part_paid" is the same across all rows of a case. ', 'column': 'part_paid', 'df_name': 4, 'example': 0         False
1         False
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: part_paid, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy_

Average Metric: 0.00 / 25 (0.0%):  92%|█████████▏| 23/25 [07:18<00:53, 26.61s/it]length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


2024/12/27 13:12:07 INFO dspy.primitives.assertions: SuggestionFailed: Column description must be short and less than 350 characters
2024/12/27 13:12:07 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create column called "unresolved" which is a boolean. TRUE if on a per case basis the column "fully_paid" AND "credit_collected" AND "dismissed" are == FALSE, if any of those columns are TRUE, "unresolved" == FALSE. Makse sure the value for "unresolved" is the same across all rows of a case.', 'column': 'unresolved', 'df_name': 4, 'example': 0          True
1          True
2         False
3         False
4         False
          ...  
561465    False
561466    False
561467    False
561468    False
561469    False
Name: unresolved, Length: 561470, dtype: bool}) (input_keys={'instruction'}): Column description must be short and less than 350 characters
Stack trace:
Traceback (most recent call last):
  File "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/dspy

Average Metric: 0.00 / 25 (0.0%):  96%|█████████▌| 24/25 [07:56<00:29, 29.96s/it]

2024/12/27 13:12:42 INFO dspy.primitives.assertions: SuggestionFailed: Do not use .apply and lambda functions to create the new column, as those will result in nan values when grouping by case (subsequent column dtype will be float64 or object). Instead use boolean operators first and subsequently group by case, then counring using for example value_counts() or methods that are more robust in avoiding nan values. (fillna(0).astype(int) will most likely also be incorrect.)
2024/12/27 13:12:42 ERROR dspy.utils.parallelizer: Error processing item Example({'instruction': 'Create a column called "paid_without_obligation" which is a boolean on a per case basis. TRUE if the column "obligation_topay_cancelled" == TRUE AND the column "fully_paid" == TRUE. If else "paid_without_obligation" == FALSE. Makse sure all values for "paid_without_obligation" are the same across all rows of a case.', 'column': 'paid_without_obligation', 'df_name': 4, 'example': 0         False
1         False
2         F

length of dp 561470
column type of new column object
pre_float_check False
Average Metric: 0.00 / 25 (0.0%): 100%|██████████| 25/25 [08:31<00:00, 20.46s/it]

2024/12/27 13:12:42 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 25 (0.0%)





Unnamed: 0,instruction,column,df_name,example,PM_Metric
0,"Create column called ""dismissed_by_prefecture"", defined as a boole...",dismissed_by_prefecture,1,0 False 1 False 2 False 3 False 4 False ... 561465 False 561466 Fa...,
1,"Create column called ""dismissed_by_judge"", defined as a boolean. T...",dismissed_by_judge,1,0 False 1 False 2 False 3 False 4 False ... 561465 False 561466 Fa...,
2,"Create a column called ""maxtotalPaymentAmount"", type int, which fo...",maxtotalPaymentAmount,1,0 0 1 0 2 0 3 0 4 0 .. 561465 0 561466 0 561467 0 561468 0 561469 ...,
3,"Create a column called ""expense_sum"", defined as type int, which s...",expense_sum,1,0 1100 1 1100 2 1100 3 1100 4 1100 ... 561465 1516 561466 1516 561...,
4,"Create a column called ""amount_last"", defined as type int, which t...",amount_last,1,0 3500 1 3500 2 7150 3 7150 4 7150 ... 561465 26200 561466 26200 5...,
5,"Create a column called ""dismissed"" which is a boolean, TRUE if any...",dismissed,1,0 False 1 False 2 False 3 False 4 False ... 561465 False 561466 Fa...,
6,"Create a column called ""credit_collected"" which is a boolean. True...",credit_collected,1,0 False 1 False 2 True 3 True 4 True ... 561465 True 561466 True 5...,
7,"Create a column called ""penalty_added"" which is a boolean, TRUE if...",penalty_added,1,0 False 1 False 2 True 3 True 4 True ... 561465 True 561466 True 5...,
8,"Create a column called ""appealed_to_judge"" which is a boolean. TRU...",appealed_to_judge,1,0 False 1 False 2 False 3 False 4 False ... 561465 False 561466 Fa...,
9,"Create a column called ""appealed_to_prefecture"" which is a boolean...",appealed_to_prefecture,1,0 False 1 False 2 False 3 False 4 False ... 561465 False 561466 Fa...,


In [178]:
# Hand the evaluation outputs and scores to save_report_PM together with the
# target report path, the evaluated program and the open SQLite connection;
# its return value is kept as df_merged.
# NOTE(review): `outputs`, `scores`, `un_py_simple_no_assert` and `conn` all
# come from other cells — make sure those have been run first.
df_merged = save_report_PM(
    outputs,
    scores,
    "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Results_PY/PY_testset_simple_no_assertion",
    un_py_simple_no_assert,
    conn=conn,
)

In [170]:
# Reset the evaluation database and rebuild the event_log table so another
# evaluation can start from a clean state.
# `conn` is intentionally left open when this cell finishes: other cells in
# this notebook pass it on (e.g. save_report_PM(..., conn=conn)).

conn = sqlite3.connect(SQLITE_DB_EVAL_NAME)
cur = conn.cursor()
try:
    # Remove temp_table if it exists (presumably left over from an earlier run).
    # DDL statements return no rows, so there is nothing to fetch or print here.
    cur.execute("DROP TABLE IF EXISTS temp_table;")
    conn.commit()

    # Re-create event_log from the DataFrame. if_exists="replace" drops the old
    # table (and with it its indexes), so the indexes below can be created
    # unconditionally.
    df_t.to_sql("event_log", conn, if_exists="replace", index=False)
    cur.execute("CREATE INDEX idx_case_concept_name_event_log ON event_log(case_concept_name)")
    conn.commit()

    # Store each row's rowid in an explicit idx column. VACUUM may renumber the
    # rowids of tables without an INTEGER PRIMARY KEY, so this is presumably
    # what keeps the row order stable afterwards.
    # NOTE(review): this cell used to wrap the assignment in a
    # ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) subquery
    # but still selected rowid from it, i.e. it stored exactly this value.
    # If idx was meant to follow case/timestamp order instead, that is a
    # deliberate behaviour change and must be checked against whatever reads idx.
    cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
    conn.commit()
    cur.execute("UPDATE event_log SET idx = rowid;")
    conn.commit()
    cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
    conn.commit()

    # VACUUM fails inside an open transaction, hence the commit right before it.
    cur.execute("VACUUM;")
    cur.execute("ANALYZE;")
    conn.commit()
finally:
    # Release the cursor even if one of the statements above raised.
    cur.close()

[]


# Optimization 

In [25]:
# Build the training-mode program with assertions switched on, then restore
# its previously saved state from disk.
py_special = PM_PY_no_deep(
    conn_path=SQLITE_DB_NAME,
    rm=rm,
    training_mode=True,
).activate_assertions()
py_special.load(
    "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_fewshot_12 copy 2.json"
)
# Clear the compiled flag after loading — presumably so the optimizer below
# accepts this program as a student again; TODO confirm.
py_special._compiled = False

In [26]:
# Same program class with assertions enabled, but without training_mode and
# without loading any saved state.
uncompiled_py = PM_PY_no_deep(
    conn_path=SQLITE_DB_NAME,
    rm=rm,
).activate_assertions()

In [26]:
# Reset the training database and rebuild the event_log table so another
# run can start from a clean state.
# `conn` is left open when this cell finishes, as before; later cells may
# rely on it.

conn = sqlite3.connect(SQLITE_DB_NAME)
cur = conn.cursor()
try:
    # Remove temp_table if it exists (presumably left over from an earlier run).
    # DDL statements return no rows, so there is nothing to fetch or print here.
    cur.execute("DROP TABLE IF EXISTS temp_table;")
    conn.commit()

    # Re-create event_log from the DataFrame. if_exists="replace" drops the old
    # table (and with it its indexes), so the indexes below can be created
    # unconditionally.
    df.to_sql("event_log", conn, if_exists="replace", index=False)
    cur.execute("CREATE INDEX idx_case_concept_name_event_log ON event_log(case_concept_name)")
    conn.commit()

    # Store each row's rowid in an explicit idx column. VACUUM may renumber the
    # rowids of tables without an INTEGER PRIMARY KEY, so this is presumably
    # what keeps the row order stable afterwards.
    # NOTE(review): this cell used to wrap the assignment in a
    # ROW_NUMBER() OVER (ORDER BY case_concept_name, time_timestamp) subquery
    # but still selected rowid from it, i.e. it stored exactly this value.
    # If idx was meant to follow case/timestamp order instead, that is a
    # deliberate behaviour change and must be checked against whatever reads idx.
    cur.execute("ALTER TABLE event_log ADD COLUMN idx INTEGER;")
    conn.commit()
    cur.execute("UPDATE event_log SET idx = rowid;")
    conn.commit()
    cur.execute("CREATE INDEX idx_event_log_idx ON event_log(idx);")
    conn.commit()

    # VACUUM fails inside an open transaction, hence the commit right before it.
    cur.execute("VACUUM;")
    cur.execute("ANALYZE;")
    conn.commit()
finally:
    # Release the cursor even if one of the statements above raised.
    cur.close()

[]


In [27]:
# Show how many examples are in the training set that is passed to the optimizer below.
len(trainset)

18

In [28]:
# First bootstrapping pass: collect at most 3 bootstrapped demos (no labeled
# demos) for py_special, scored with PM_Metric_training.
bootstrap_fewshot = BootstrapFewShot(
    metric=PM_Metric_training,
    max_labeled_demos=0,
    max_bootstrapped_demos=3,
    max_rounds=3,
    max_errors=2,
)
py_compiled_special = bootstrap_fewshot.compile(
    trainset=trainset,
    student=py_special,
)

  0%|          | 0/18 [00:00<?, ?it/s]2024/12/26 13:48:36 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: case_durations = dp[dp['concept_name'].isin(['Create Fine', 'Send Fine'])]
2024/12/26 13:48:45 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codesingle positional indexer is out-of-bounds
Stack trace:
Line 14: timedeltas = case_durations.groupby('case_concept_name').apply(
Line 15:     lambda x: (x[x['concept_name'] == 'Send Fine']['time_timestamp'].iloc[0] -
2024/12/26 13:49:21 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'timedeltas' is not defined
Stack trace:
Line 24: dp['obligation_topay_cancelled'] = dp['case_concept_name'].map(lambda x: timedeltas.get(x, 0) > 90)
Line 24: dp['obligation_topay_cancelled'] = dp['case_concept_name'].map(lambda x: timedeltas.get(x, 0) > 90)
2024/12/26 13:49:22 INFO dspy.primitives.assertions: SuggestionFailed: New Column was n

length of dp 561470
pre_float_check True
pre_distinc_value_check True
Eval could not retrieve col: obligation_topay_cancelled


2024/12/26 13:49:38 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: case_durations = dp[dp['concept_name'].isin(['Create Fine', 'Send Fine'])].groupby(['case_concept_name', 'concept_name'])['time_timestamp'].min().unstack()


length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


  6%|▌         | 1/18 [01:27<24:53, 87.84s/it]

EVAL TRUE


2024/12/26 13:50:03 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: case_event_counts = dp.groupby('case_concept_name').size()


length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 11%|█         | 2/18 [01:49<13:06, 49.15s/it]

EVAL TRUE


2024/12/26 13:50:26 INFO dspy.primitives.assertions: SuggestionFailed: Error executing codename 'dp' is not defined
Stack trace:
Line 3: amount_min_per_case = dp[dp['amount'] > 0].groupby('case_concept_name')['amount'].min().reset_index()


length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 17%|█▋        | 3/18 [02:14<11:11, 44.75s/it]

EVAL TRUE
Bootstrapped 3 full traces after 3 examples for up to 3 rounds, amounting to 4 attempts.





In [29]:
# Write the program produced by the 3-demo bootstrap to disk.
py_compiled_special.save(
    "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_add_fewshot_3.json"
)

In [35]:
# Inspect the training_mode attribute of the second compiled program.
# NOTE(review): within the visible part of this notebook, py_compiled_special_2
# is only assigned in the cell that follows this one, so a top-to-bottom run on
# a fresh kernel would hit a NameError here — run (or move) this after that cell.
py_compiled_special_2.training_mode

False

In [31]:
# Second bootstrapping pass: collect up to 12 bootstrapped demos for
# py_special, this time using the 3-demo program from the first pass as teacher.
bootstrap_fewshot = BootstrapFewShot(
    metric=PM_Metric_training,
    max_labeled_demos=0,
    max_bootstrapped_demos=12,
    max_rounds=3,
    max_errors=3,
)
py_compiled_special_2 = bootstrap_fewshot.compile(
    trainset=trainset,
    teacher=py_compiled_special,
    student=py_special,
)

  0%|          | 0/18 [00:00<?, ?it/s]

length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


  6%|▌         | 1/18 [00:18<05:20, 18.86s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 11%|█         | 2/18 [00:36<04:47, 17.99s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 17%|█▋        | 3/18 [00:54<04:34, 18.33s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 22%|██▏       | 4/18 [01:13<04:15, 18.23s/it]

EVAL TRUE
length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


 28%|██▊       | 5/18 [01:32<04:00, 18.51s/it]

EVAL TRUE
length of dp 561470
column type of new column bool
pre_float_check True
pre_distinc_value_check True


 33%|███▎      | 6/18 [01:56<04:05, 20.47s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 39%|███▉      | 7/18 [02:14<03:36, 19.66s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 44%|████▍     | 8/18 [02:32<03:10, 19.09s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 50%|█████     | 9/18 [02:50<02:48, 18.75s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 56%|█████▌    | 10/18 [03:08<02:29, 18.67s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 61%|██████    | 11/18 [03:26<02:09, 18.47s/it]

EVAL TRUE
length of dp 561470
column type of new column int64
pre_float_check True
pre_distinc_value_check True


 67%|██████▋   | 12/18 [03:44<01:52, 18.74s/it]

EVAL TRUE
Bootstrapped 12 full traces after 12 examples for up to 3 rounds, amounting to 12 attempts.





In [32]:
# Write the program produced by the 12-demo bootstrap to disk.
py_compiled_special_2.save(
    "/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/python/py_add_fewshot_12.json"
)