# Homework 3

## Step 1: Write queries

In [2]:
import pandas as pd

# Load the dietary-restriction queries and preview the first few rows.
queries_path = 'dietary_queries.csv'
queries = pd.read_csv(queries_path)

queries.head()

Unnamed: 0,id,query,dietary_restriction
0,1,I'm vegan but I really want to make something ...,vegan
1,2,Need a quick gluten-free breakfast. I hate egg...,gluten-free
2,3,Keto breakfast that I can meal prep for the week,keto
3,4,I'm dairy-free and also can't stand the taste ...,dairy-free
4,5,Vegetarian pizza but I don't like mushrooms or...,vegetarian


## Step 3: Split dataset

### Random splitting

In [3]:
from sklearn.model_selection import train_test_split

df = pd.read_json("labeled_traces.jsonl", lines=True)

# Target proportions of the full dataset for each split (they sum to 1).
train_size = 0.15
dev_size = 0.4
test_size = 0.45

# Carve off the train set first, then divide the remainder into dev and test.
# The second test_size is relative to the remainder, not to the full dataset.
train_df, holdout_df = train_test_split(df, train_size=train_size, random_state=42)
dev_df, test_df = train_test_split(
    holdout_df, test_size=test_size / (dev_size + test_size), random_state=42
)

# Persist only after both splits are done, so test_traces.csv holds the final
# test rows alone. Writing it before the dev rows were removed leaked the dev
# set into the file. The stratified cell further down rewrites these same files.
train_df.to_csv("train_traces.csv", index=False)
test_df.to_csv("test_traces.csv", index=False)

print(f"train: {train_df.shape}, dev: {dev_df.shape}, test: {test_df.shape}")

# Tag each row with its split and recombine to inspect the label balance.
# .assign builds new frames instead of writing into train_test_split's outputs.
train_df = train_df.assign(split="train")
dev_df = dev_df.assign(split="dev")
test_df = test_df.assign(split="test")
df = pd.concat([train_df, dev_df, test_df])

# Rows: split, columns: label, plus a per-split row total.
stats = df.groupby(['split', 'label']).size().unstack(fill_value=0)
stats['total'] = stats.sum(axis=1)
stats

train: (15, 11), dev: (40, 11), test: (46, 11)


label,FAIL,PASS,total
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dev,14,26,40
test,9,37,46
train,3,12,15


### Stratified splitting

In [4]:
# Label balance of the current training split.
label_counts = train_df['label'].value_counts()
label_counts

label
PASS    12
FAIL     3
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

df = pd.read_json("labeled_traces.jsonl", lines=True)

# Target proportions of the full dataset for each split (they sum to 1).
train_size = 0.15
dev_size = 0.4
test_size = 0.45

# Stratify on the label in both steps so every split keeps roughly the same
# PASS/FAIL ratio. The second test_size is relative to the remainder.
train_df, holdout_df = train_test_split(
    df, train_size=train_size, shuffle=True, random_state=42, stratify=df['label']
)
dev_df, test_df = train_test_split(
    holdout_df,
    test_size=test_size / (dev_size + test_size),
    shuffle=True,
    random_state=42,
    stratify=holdout_df['label'],
)

# Persist only after both splits are done, so test_traces.csv holds the final
# test rows alone. Writing it before the dev rows were removed leaked the dev
# set into the file. The dev set is saved as well so all three splits can be
# reloaded from disk.
train_df.to_csv("train_traces.csv", index=False)
dev_df.to_csv("dev_traces.csv", index=False)
test_df.to_csv("test_traces.csv", index=False)

print(f"train: {train_df.shape}, dev: {dev_df.shape}, test: {test_df.shape}")

# Tag each row with its split and recombine to inspect the label balance.
# .assign builds new frames instead of writing into train_test_split's outputs.
train_df = train_df.assign(split="train")
dev_df = dev_df.assign(split="dev")
test_df = test_df.assign(split="test")
df = pd.concat([train_df, dev_df, test_df])

# Rows: split, columns: label, plus per-split totals and FAIL percentage.
stats = df.groupby(['split', 'label']).size().unstack(fill_value=0)
stats['total'] = stats.sum(axis=1)
stats['fail_rate'] = ((stats['FAIL'] / stats['total']) * 100).round(2)
stats

train: (15, 11), dev: (40, 11), test: (46, 11)


label,FAIL,PASS,total,fail_rate
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dev,10,30,40,25.0
test,12,34,46,26.09
train,4,11,15,26.67


## Step 4: Develop Judge

Create the judge with few-shot prompting, then evaluate it on the dev set.

In [25]:
from pydantic import BaseModel, Field
from enum import Enum
from litellm import completion
from dotenv import load_dotenv

# Load environment variables from ../.env (path is relative to the working directory).
# NOTE(review): presumably this supplies the API credentials litellm needs — confirm.
load_dotenv("../.env")   


# Verdict values the judge may return. Subclassing str makes each member compare
# equal to its plain string value ("PASS" / "FAIL").
# Documented with comments rather than a docstring because this enum is part of
# the Judge schema passed as response_format, and a docstring could be copied
# into that schema.
class Label(str, Enum):
    PASS = "PASS"
    FAIL = "FAIL"

# Structured output of the judge: a short justification plus a PASS/FAIL label.
# The class is passed to completion() as response_format and is also used to
# validate the JSON the model returns.
# Documented with comments rather than a docstring because a class docstring
# could be copied into the schema sent to the model.
class Judge(BaseModel):
    # Free-text justification; the field description asks for at most 30 words.
    reasoning: str = Field(description="Detailed explanation of your evaluation, citing specific ingredients or method. No more than 30 words.")
    # Final verdict, restricted to the Label enum values.
    label: Label = Field(description="The label for the recipe response")

# Prompt template for the LLM judge, filled in with str.format().
# Placeholders:
#   {examples}             - rendered few-shot examples
#   {query}                - the user query being evaluated
#   {dietary_restrictions} - the dietary restriction attached to the query
#   {response}             - the recipe response to be judged
# The text is sent to the model verbatim, so any wording change alters the
# judge's behaviour.
JUDGE_PROMPT_TEMPLATE = """
You are an expert nutritionist who understands the stricts requirements of different kinds of diets.
Your job is to evaluate whether the recipe response given adheres to the dietary requirements mentioned in the user query. 

## Guidelines

### Dietary Restrictions Definitions:
- Vegan: No animal products (meat, dairy, eggs, honey, etc.)
- Vegetarian: No meat or fish, but dairy and eggs are allowed
- Gluten-free: No wheat, barley, rye, or other gluten-containing grains
- Dairy-free: No milk, cheese, butter, yogurt, or other dairy products
- Keto: Very low carb (typically <20g net carbs), high fat, moderate protein
- Paleo: No grains, legumes, dairy, refined sugar, or processed foods
- Pescatarian: No meat except fish and seafood
- Kosher: Follows Jewish dietary laws (no pork, shellfish, mixing meat/dairy)
- Halal: Follows Islamic dietary laws (no pork, alcohol, proper slaughter)
- Nut-free: No tree nuts or peanuts
- Low-carb: Significantly reduced carbohydrates (typically <50g per day)
- Sugar-free: No added sugars or high-sugar ingredients
- Raw vegan: Vegan foods not heated above 118°F (48°C)
- Whole30: No grains, dairy, legumes, sugar, alcohol, or processed foods
- Diabetic-friendly: Low glycemic index, controlled carbohydrates
- Low-sodium: Reduced sodium content for heart health

### Evaluation Criteria:
- PASS: The recipe clearly adheres to the dietary preferences with appropriate ingredients and preparation methods
- FAIL: The recipe contains ingredients or methods that violate the dietary preferences
- Consider both explicit ingredients and cooking methods

### Examples:
Here are some examples of how to evaluate dietary adherence:
{examples}

## Evaluate the following

### User Query:
{query}

### Dietary Restrictions:
{dietary_restrictions}

### Recipe Response:
{response}
"""


def get_few_shot_examples(
    df: pd.DataFrame, num_pos: int, num_neg: int, random_state: "int | None" = 42
) -> list[dict]:
    """Draw labelled rows from ``df`` to use as few-shot examples.

    Args:
        df: Frame with a ``label`` column containing ``'PASS'`` / ``'FAIL'``.
        num_pos: Number of ``'PASS'`` rows to draw.
        num_neg: Number of ``'FAIL'`` rows to draw.
        random_state: Seed forwarded to ``DataFrame.sample``. The default is a
            fixed seed so repeated calls return the same examples and the
            prompt built from them stays stable. Pass ``None`` for a fresh
            random draw on every call.

    Returns:
        One dict per sampled row (``orient='records'``), FAIL rows first,
        followed by PASS rows.

    Raises:
        ValueError: Raised by pandas when more rows are requested than are
            available for a label (sampling is without replacement).
    """
    fail_examples = df[df['label'] == 'FAIL'].sample(num_neg, random_state=random_state)
    pass_examples = df[df['label'] == 'PASS'].sample(num_pos, random_state=random_state)
    examples = pd.concat([fail_examples, pass_examples])
    return examples.to_dict(orient='records')

def judge(query: str, dietary_restriction: str, response: str) -> Judge:
    """Ask the LLM judge whether ``response`` respects the dietary restriction.

    Builds a few-shot prompt from ``train_df`` (1 PASS and 3 FAIL examples),
    sends it as a single user message, and parses the reply into a ``Judge``.

    Args:
        query: The user query text.
        dietary_restriction: The dietary restriction to check against.
        response: The recipe response being judged.

    Returns:
        The validated ``Judge`` verdict (reasoning and label).
    """
    few_shot = get_few_shot_examples(train_df, 1, 3)

    # Render each example as its own block; blocks are joined by a newline.
    rendered = []
    for position, shot in enumerate(few_shot, start=1):
        rendered.append(
            f"### Example {position}:\n"
            f" Query: {shot['query']}\n"
            f" Recipe Response: {shot['response']}\n"
            f" Reasoning: {shot['reasoning']}\n"
            f" Label: {shot['label']}\n"
        )
    examples_str = "\n".join(rendered)

    prompt = JUDGE_PROMPT_TEMPLATE.format(
        examples=examples_str,
        query=query,
        dietary_restrictions=dietary_restriction,
        response=response,
    )

    # Kept in its own name so the `response` argument is not rebound.
    llm_reply = completion(
        model="gpt-4.1-nano",
        messages=[{"role": "user", "content": prompt}],
        response_format=Judge,
        temperature=0.0,
    )
    raw_json = llm_reply.choices[0].message.content
    return Judge.model_validate_json(raw_json)


In [26]:
# Spot-check the judge on one randomly drawn dev-set row.
sample_row = dev_df.sample(1)
sample = sample_row.to_dict(orient='records')[0]
judge(
    sample['query'],
    sample['dietary_restriction'],
    sample['response'],
)

Judge(reasoning='The recipe is vegetarian, featuring tofu and chickpeas as protein sources, suitable for a bodybuilder. Ingredients and preparation align with dietary needs.', label=<Label.PASS: 'PASS'>)

### Evaluate the judge

In [47]:
from sklearn.metrics import confusion_matrix
from tqdm import tqdm


def evaluate_judge(df: pd.DataFrame):
    """Run the judge over every row of ``df`` and score it against the labels.

    Args:
        df: Frame with ``query``, ``dietary_restriction``, ``response`` and
            ``label`` columns. It is not modified; a copy is annotated instead,
            so slices such as ``dev_df[:10]`` can be passed safely.

    Returns:
        A tuple ``(tpr, tnr, results)`` where ``results`` is a copy of ``df``
        with ``judge_reasoning`` and ``judge_label`` columns added. ``'PASS'``
        is treated as the positive class, so ``tpr`` is the share of true PASS
        rows judged PASS and ``tnr`` the share of true FAIL rows judged FAIL.
        A rate is ``nan`` when its class does not occur in ``df``.
    """
    # Annotate a copy rather than the caller's frame (which may be a slice).
    results = df.copy()

    # Collect verdicts in order and assign whole columns once, instead of
    # setting cells one at a time by index label.
    reasonings, labels = [], []
    # iterrows() has no length, so pass the total to get a real progress bar.
    for _, row in tqdm(results.iterrows(), total=len(results)):
        verdict = judge(row['query'], row['dietary_restriction'], row['response'])
        reasonings.append(verdict.reasoning)
        labels.append(verdict.label.value)
    results['judge_reasoning'] = reasonings
    results['judge_label'] = labels

    # With labels=['FAIL', 'PASS'] the flattened matrix is ordered
    # (true FAIL, pred FAIL), (true FAIL, pred PASS),
    # (true PASS, pred FAIL), (true PASS, pred PASS).
    tn, fp, fn, tp = confusion_matrix(
        results['label'], results['judge_label'], labels=['FAIL', 'PASS']
    ).ravel()
    # Guard the denominators so a missing class yields nan without a 0/0 warning.
    tpr = float(tp / (tp + fn)) if (tp + fn) else float('nan')
    tnr = float(tn / (tn + fp)) if (tn + fp) else float('nan')
    return tpr, tnr, results

In [48]:
# Quick check of the judge on the first 10 dev rows only.
dev_subset = dev_df[:10]
tpr, tnr, df = evaluate_judge(dev_subset)
(tpr, tnr)

10it [00:10,  1.02s/it]


(1.0, 1.0)