In [16]:
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
print(os.environ["OPENAI_API_KEY"])
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

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


In [11]:
import json

SYSTEM = """\
You are an expert grader that never makes mistakes. 

### Question Type
You will be given a step-by-step SOLUTION for a QUESTION that asks for the marginal distribution of some random variable $v_i$ under an intervention on some other variable $v_j$ ($i$ is allowed to equal $j$). 

### Catching Errors
You will be given the definition of an ERROR. Your task is to determine whether the SOLUTION made that ERROR. Justify your decision by pointing out EVIDENCE - locations in SOLUTION where the ERROR happens, and provide a short EXPLANATION.

### Formatting Response
Put your EVIDENCE, if any, within the <evidence></evidence> tag, then your EXPLANATION, if any, within the <explanation></explanation> tag, and finally put your judgement (choose from "Yes" or "No") in <judgement></judgement> tag, where Yes means it contains the ERROR defined above, and No means it does not contain the ERROR defined above. Note that a SOLUTION can contain multiple kinds of errors, please focus on judging only the ERROR that is defined above.
"""

USER = """\
### QUESTION:
<question>
{question}
</question>

### SOLUTION:
<student_solution>
{answer}
</student_solution>

### ERROR definition:
<error_definition>
{error}
</error_definition>

Does the SOLUTION make the ERROR defined above?"""

MODELING_ERROR = """\
Modeling Error
* This category of errors focuses on mistakes that happened during the derivation of NEW probability distributions in SOLUTION. NEW distributions includes any distribution that is computed in SOLUTIOn but not PROVIDED by QUESTION. For example, the target distribution p($v_i$), is often a NEW probability distribution (which is rarely provided in QUESTION). The SOLUTION will often also contain NEW joint and conditional distributions not directly provided by QUESTION.

* A SOLUTION contains a Modeling Error if any the definition or derivation of a NEW probability distribution contains erroneous probability formulas, indicating a potential error in student's knowledge.

* In NEW full joint distributions, which often result from multiplying several conditional distributions, erroneous formulas often has incorrect terms, or are missing or have extra terms. This can be a result of making incorrect independence assumptions are made that's not warranted by the graph in QUESTION (e.g substituting  $p(x) \cdot p(y)$ for $p(x, y)$ or $p(x)$ for $p(x | y)$, when the graph does not suggest independence between x and y). Sometimes the dependence between two variables $v_i$ and $v_j$ is evident only if you look at the full graph or a large enough subgraph (e.g. $v_i$ and $v_j$ are long descendents off of two separate branches off of a common ancestor $v_k$). Sometimes indepedence assumptions are warranted not by the graph structure, but by the numerical value of the distribution (e.g. when $p(V_i=v_i) = 1$ for some $v_i$).

* In NEW marginal joint distributions, which involves summing some full joint over values assignments, erroneous formulas often misses summing over some variables (e.g. incorrectly using $p(x) = \sum_y p(x,y,z)$ when it should have also summed over $z$), or did not sum over all values exhaustively (e.g. incorrectly using $p(x) = p(x,y=0,z=0) + p(x,y=0,z=1)$ which forgets to sum over other values of $y$).

* A SOLUTION can also contain a Modeling Error if they did not consider the intervention on $v_i$ properly. This can happen in ONLY two ways, 1) if the SOLUTION attempted to compute the distribution of $v_i$ from its parents, or 2) if the SOLUTION attempted to marginalize over $v_i$ (incorrect since $v_i$ is intervened on and set to a fixed value). 

What's NOT a Modeling Error? 
* It is NOT a Modeling Error when the student copied CPTs numerically incorrectly from the QUESTION, or when they multiply or add numbers to incorrect results. This includes misreading the CPTs. These are low-level copy and calculation errors, not Modeling Error.

* It is NOT a Modeling Error to ignore the intervention when the intervened on variable $v_i$ is NOT an ancestor of any of the question variable $v_j$. In that case, the intervention has no effect on $v_j$ and one should just compute $v_j$ in the original graph.

* Focus on probability formulas that are actually used to compute the solution. If the student makes an erroneous statement but does not make modeling errors in the actual formulas, do NOT count it as a Modeling Error.
"""


def format_system_prompt():
    return SYSTEM

def format_user_prompt(question, answer, error):
    return USER.format(question=question, answer=answer, error=error)

def annotate(question, answer, error, model):
    sysprompt = format_system_prompt()
    userprompt = format_user_prompt(question, answer, error)
    messages = [
        {'role': 'system', 'content': sysprompt},
        {'role': 'user', 'content': userprompt},
    ]
    print('[system]', sysprompt)
    print('[user]', userprompt)
    if "o3" or "o4" in model:
        output = client.chat.completions.create(messages=messages, model=model, max_completion_tokens=8192)
    else:
        output = client.chat.completions.create(messages=messages, temperature=0, model=model, max_completion_tokens=8192)
    annotation = output.choices[0].message.content
    return annotation

def load_question(i):
    with open(f'data/manual1/{i}.question.txt', 'rt') as f:
        question = f.read()
    return question

def load_answer(i):
    with open(f'data/manual1/{i}.answer.txt', 'rt') as f:
        answer = f.read()
    return answer

def annotate_manual_example(i, error, error_name, model="gpt-4o-mini"):
    question = load_question(i)
    answer = load_answer(i)
    ans = annotate(question, answer, error, model)
    with open(f'data/manual1/{i}.annotation.{model}.{error_name}.txt', 'wt') as f:
        print(ans, file=f)
    return ans
    
print('---')
print('[system]', format_system_prompt())
print('---')
print('[user]', format_user_prompt(load_question(1), load_answer(1), MODELING_ERROR))


---
[system] You are an expert grader that never makes mistakes. 

### Question Type
You will be given a step-by-step SOLUTION for a QUESTION that asks for the marginal distribution of some random variable $v_i$ under an intervention on some other variable $v_j$ ($i$ is allowed to equal $j$). 

### Catching Errors
You will be given the definition of an ERROR. Your task is to determine whether the SOLUTION made that ERROR. Justify your decision by pointing out EVIDENCE - locations in SOLUTION where the ERROR happens, and provide a short EXPLANATION.

### Formatting Response
Put your EVIDENCE, if any, within the <evidence></evidence> tag, then your EXPLANATION, if any, within the <explanation></explanation> tag, and finally put your judgement (choose from "Yes" or "No") in <judgement></judgement> tag, where Yes means it contains the ERROR defined above, and No means it does not contain the ERROR defined above. Note that a SOLUTION can contain multiple kinds of errors, please focus on jud

In [3]:
# for i in range(1, 11):
#     annotate_manual_example(i, MODELING_ERROR, 'model_err', model='gpt-4.1-mini')

In [8]:
import re
def parse_judgement(s):
    match = re.search(r"<judgement>(.*?)</judgement>", s, re.DOTALL | re.IGNORECASE)
    value = None
    if match:
        content = match.group(1).strip().lower()
        if content == "yes":
            value = 1
        elif content == "no":
            value = 0
        else:
            value = -1
    else:
        value = -2
    return value

In [None]:
from concurrent.futures import ThreadPoolExecutor
import re
from datasets import load_dataset
import numpy as np


def annotate_0719(error, error_name, model='gpt-4.1-mini', first=None):
    data = load_dataset('csv', data_files='data/0719_annotation.csv')['train']
    if first is not None:
        data = data.select(list(range(first)))
    column = []
    column_name = f"{error_name}_raw"
    column_futures = []
    with ThreadPoolExecutor(max_workers=32) as pool:
        for i in range(len(data)):
            question = data[i]['question']
            answer = data[i]['answer']
            ans = pool.submit(annotate, question, answer, error, model)
            column_futures.append(ans)
    for cf in column_futures:
        column.append(cf.result())
    data = data.add_column(column_name, column)
    data.to_parquet(f"data/0719_{error_name}.parquet")
    return data

annotation_dataset = annotate_0719( MODELING_ERROR, 'modeling_error', model='o4-mini', first=None)

annotation_dataset = annotation_dataset.map(lambda x: {'match': x["modeling_error_human"] == parse_judgement(x['modeling_error_raw'])})


print(np.mean(annotation_dataset['match']))

[system] You are an expert grader that never makes mistakes. 

### Question Type
You will be given a step-by-step SOLUTION for a QUESTION that asks for the marginal distribution of some random variable $v_i$ under an intervention on some other variable $v_j$ ($i$ is allowed to equal $j$). 

### Catching Errors
You will be given the definition of an ERROR. Your task is to determine whether the SOLUTION made that ERROR. Justify your decision by pointing out EVIDENCE - locations in SOLUTION where the ERROR happens, and provide a short EXPLANATION.

### Formatting Response
Put your EVIDENCE, if any, within the <evidence></evidence> tag, then your EXPLANATION, if any, within the <explanation></explanation> tag, and finally put your judgement (choose from "Yes" or "No") in <judgement></judgement> tag, where Yes means it contains the ERROR defined above, and No means it does not contain the ERROR defined above. Note that a SOLUTION can contain multiple kinds of errors, please focus on judging

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1106.09ba/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 11446.29 examples/s]

0.85





In [19]:
from sklearn.metrics import f1_score, precision_score, recall_score


annotation_dataset = annotation_dataset.map(lambda x: {'modeling_error_pred': parse_judgement(x['modeling_error_raw'])})

precision = precision_score(annotation_dataset['modeling_error_human'], np.array(annotation_dataset['modeling_error_pred']) == 1)
print(f"precision Score: {precision:.4f}")
recall = recall_score(annotation_dataset['modeling_error_human'], np.array(annotation_dataset['modeling_error_pred']) == 1)
print(f"recall Score: {recall:.4f}")
f1 = f1_score(annotation_dataset['modeling_error_human'], np.array(annotation_dataset['modeling_error_pred']) == 1)
print(f"F1 Score: {f1:.4f}")
print(f'Format Error rate: {np.mean(np.array(annotation_dataset["modeling_error_pred"]) < 0)}')

print(f'Error rate: {np.mean(annotation_dataset["modeling_error_human"])}')
print(f'Error rate 7b: {np.mean(annotation_dataset.filter(lambda x: "7b" in x["system_name"])["modeling_error_human"])}')
print(f'Error rate 32b: {np.mean(annotation_dataset.filter(lambda x: "32b" in x["system_name"])["modeling_error_human"])}')
print(f'Error rate 7b pred: {np.mean(annotation_dataset.filter(lambda x: "7b" in x["system_name"])["modeling_error_pred"])}')
print(f'Error rate 32b pred: {np.mean(annotation_dataset.filter(lambda x: "32b" in x["system_name"])["modeling_error_pred"])}')

Map: 100%|██████████| 60/60 [00:00<00:00, 11022.66 examples/s]


precision Score: 0.8125
recall Score: 0.9286
F1 Score: 0.8667
Format Error rate: 0.016666666666666666
Error rate: 0.4666666666666667


Filter: 100%|██████████| 60/60 [00:00<00:00, 30142.32 examples/s]


Error rate 7b: 0.6


Filter: 100%|██████████| 60/60 [00:00<00:00, 32995.70 examples/s]

Error rate 32b: 0.3333333333333333
Error rate 7b pred: 0.7
Error rate 32b pred: 0.3





In [14]:
from datasets import load_dataset
annotation_dataset = load_dataset('parquet', data_files=f"data/0719_modeling_error.parquet")['train']
annotation_dataset = annotation_dataset.map(lambda x: {'modeling_error_pred': parse_judgement(x['modeling_error_raw'])})

for i in range(60):
    if annotation_dataset[i]['modeling_error_human'] == 1 and annotation_dataset[i]['modeling_error_pred'] == 1:
        print('---')
        print('[system]', format_system_prompt())
        print('---')
        print('[user]', format_user_prompt(annotation_dataset[i]['question'], annotation_dataset[i]['answer'], MODELING_ERROR))
        print('---')
        print('[judge]')
        print(annotation_dataset[i]['modeling_error_raw'])
        input("enter")

---
[system] You are an expert grader that never makes mistakes. 

### Question Type
You will be given a step-by-step SOLUTION for a QUESTION that asks for the marginal distribution of some random variable $v_i$ under an intervention on some other variable $v_j$ ($i$ is allowed to equal $j$). 

### Catching Errors
You will be given the definition of an ERROR. Your task is to determine whether the SOLUTION made that ERROR. Justify your decision by pointing out EVIDENCE - locations in SOLUTION where the ERROR happens, and provide a short EXPLANATION.

### Formatting Response
Put your EVIDENCE, if any, within the <evidence></evidence> tag, then your EXPLANATION, if any, within the <explanation></explanation> tag, and finally put your judgement (choose from "Yes" or "No") in <judgement></judgement> tag, where Yes means it contains the ERROR defined above, and No means it does not contain the ERROR defined above. Note that a SOLUTION can contain multiple kinds of errors, please focus on jud

KeyboardInterrupt: Interrupted by user

In [None]:
annotation_dataset = annotation_dataset.map(lambda x: {'match': x["formula error - human"] == parse_judgement(x['modeling_error_raw'])})

Map: 100%|██████████| 1/1 [00:00<00:00, 295.75 examples/s]


In [12]:
annotation_dataset['match']

[True]