In [8]:
import os
import json
from openai import OpenAI
from tqdm import tqdm
from tabulate import tabulate

In [2]:
client = OpenAI(api_key=os.environ.get("OPENAI_INTERVIEW_API_KEY"))

In [16]:
test_cases = [
    "Show me edgy outfits for a night out",
    "I want a cozy sweater for fall",
]

In [14]:
few_shot_prompt = """You are a fashion query parser. Given a user's natural language request, extract:
- The main item category (e.g. dress, coat)
- Style tags like season, aesthetic, occasion

Examples:
Input: "I need a cute dress for summer"
→ Category: dress
→ Tags: cute, summer

Input: "Looking for comfy pants to wear at home"
→ Category: pants
→ Tags: comfy, home

Input: "{query}"
→ Category:"""

cot_prompt = """You are a fashion query parser. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what type of clothing or item is mentioned.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <main item>
Tags: <comma-separated list>
"""

# --- Models + Prompt Strategies ---
strategies = [
    {"name": "few_shot", "prompt_template": few_shot_prompt},
    {"name": "cot", "prompt_template": cot_prompt},
]

models = ["gpt-4.5-preview-2025-02-27", "gpt-4o-2024-08-06"]

In [12]:
# --- Main Evaluation Loop ---
results = []
for test in tqdm(test_cases, desc="Running test cases"):
    for model in models:
        for strat in strategies:
            prompt = strat["prompt_template"].format(query=test)

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                content = response.choices[0].message.content
                lines = content.splitlines()
                category = ""
                tags = []

                for line in lines:
                    if "category" in line.lower():
                        category = line.split(":")[-1].strip()
                    if "tag" in line.lower():
                        tags = [t.strip() for t in line.split(":")[-1].split(",") if t.strip()]

                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": category,
                    "tags": tags,
                })

            except Exception as e:
                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": f"ERROR: {e}",
                    "tags": [],
                })

Running test cases: 100%|████████████████████████████████████████| 2/2 [00:14<00:00,  7.17s/it]


In [15]:
# --- Pretty Output ---
table = []
for r in results:
    if "o1" not in r["model"]:
        table.append([
            r["query"],
            r["model"].replace("gpt-", ""),
            r["strategy"],
            r["category"],
            ", ".join(r["tags"]),
        ])

print(tabulate(table, headers=["Query", "Model", "Strategy", "Category", "Tags"], tablefmt="grid"))


+---------------------------------------------+------------------------+------------+-------------+------------------------------+
| Query                                       | Model                  | Strategy   | Category    | Tags                         |
| I need a cute dress for summer              | 4.5-preview-2025-02-27 | few_shot   |             | cute, summer                 |
+---------------------------------------------+------------------------+------------+-------------+------------------------------+
| I need a cute dress for summer              | 4.5-preview-2025-02-27 | cot        | Dress       | Cute, Summer                 |
+---------------------------------------------+------------------------+------------+-------------+------------------------------+
| I need a cute dress for summer              | 4o-2024-08-06          | few_shot   |             | cute, summer                 |
+---------------------------------------------+------------------------+-----------

Initial evaluations show that COT works significantly better than few shot. Let's refine with more test cases.

In [23]:
cot_prompt = """You are a fashion query parser. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what type of clothing or item is mentioned.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <main item>
Tags: <comma-separated list>
"""

cot_prompt2 = """You are an expert fashion stylist. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what types of clothing or items are mentioned. If no specific item is mentioned, put unspecified.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <main item>
Tags: <comma-separated list>
"""

test_cases = [
    # "I need a cute dress for summer",
    # "Looking for something comfy to wear at home",
    # "Show me edgy outfits for a night out",
    # "I want a cozy sweater for fall",
    "Any recommendations for a job interview outfit?",
    "Something warm but stylish for winter",
    "Give me a wedding guest dress",
    "What’s a good outfit for a date night?",
    "I need a minimalist capsule wardrobe",
    "Looking for festival wear",
]
# --- Models + Prompt Strategies ---
strategies = [
    {"name": "cot", "prompt_template": cot_prompt},
    {"name": "cot2", "prompt_template": cot_prompt2},
]

models = ["gpt-4.5-preview-2025-02-27", "gpt-4o-2024-08-06"]

# --- Main Evaluation Loop ---
results = []
for test in tqdm(test_cases, desc="Running test cases"):
    for model in models:
        for strat in strategies:
            prompt = strat["prompt_template"].format(query=test)

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                content = response.choices[0].message.content
                lines = content.splitlines()
                category = ""
                tags = []

                for line in lines:
                    if "category" in line.lower():
                        category = line.split(":")[-1].strip()
                    if "tag" in line.lower():
                        tags = [t.strip() for t in line.split(":")[-1].split(",") if t.strip()]

                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": category,
                    "tags": tags,
                })

            except Exception as e:
                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": f"ERROR: {e}",
                    "tags": [],
                })


Running test cases: 100%|████████████████████████████████████████| 6/6 [01:04<00:00, 10.74s/it]


In [25]:
# --- Pretty Output ---
table = []
for r in results:
    if "o1" not in r["model"] and r["strategy"] == "cot2":
        table.append([
            r["query"],
            r["model"].replace("gpt-", ""),
            r["strategy"],
            r["category"],
            ", ".join(r["tags"]),
        ])

print(tabulate(table, headers=["Query", "Model", "Strategy", "Category", "Tags"], tablefmt="grid"))


+-------------------------------------------------+------------------------+------------+-------------+----------------------------------------------------------------+
| Query                                           | Model                  | Strategy   | Category    | Tags                                                           |
| Any recommendations for a job interview outfit? | 4.5-preview-2025-02-27 | cot2       | Unspecified | job interview, professional, formal, polished, business attire |
+-------------------------------------------------+------------------------+------------+-------------+----------------------------------------------------------------+
| Any recommendations for a job interview outfit? | 4o-2024-08-06          | cot2       | Unspecified | job interview, professional, formal, business attire           |
+-------------------------------------------------+------------------------+------------+-------------+----------------------------------------------------

Seems that GPT 4.5 performs better than 4o; retrieving more tags. Let's try some more prompts

In [26]:
cot_prompt = """You are a fashion query parser. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what type of clothing or item is mentioned.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <main item>
Tags: <comma-separated list>
"""

cot_prompt2 = """You are an expert fashion stylist. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what types of clothing or items are mentioned. If no specific item is mentioned, put unspecified.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <main item>
Tags: <comma-separated list>
"""

cot_prompt3 = """You are an expert fashion stylist. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what types of clothing or items are mentioned. If no specific item is mentioned, leave it blank.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <comma-separated list>
Tags: <comma-separated list>
"""

test_cases = [
    # "I need a cute dress for summer",
    # "Looking for something comfy to wear at home",
    # "Show me edgy outfits for a night out",
    "I want a cozy sweater or jacket for fall",
    "Any recommendations for a job interview outfit?",
    # "Something warm but stylish for winter",
    "Give me a wedding guest dress",
    # "What’s a good outfit for a date night?",
    # "I need a minimalist capsule wardrobe",
    # "Looking for festival wear",
]
# --- Models + Prompt Strategies ---
strategies = [
    # {"name": "cot", "prompt_template": cot_prompt},
    {"name": "cot2", "prompt_template": cot_prompt2},
    {"name": "cot3", "prompt_template": cot_prompt3},
]

models = ["gpt-4.5-preview-2025-02-27"]

# --- Main Evaluation Loop ---
results = []
for test in tqdm(test_cases, desc="Running test cases"):
    for model in models:
        for strat in strategies:
            prompt = strat["prompt_template"].format(query=test)

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )

                content = response.choices[0].message.content
                lines = content.splitlines()
                category = ""
                tags = []

                for line in lines:
                    if "category" in line.lower():
                        category = line.split(":")[-1].strip()
                    if "tag" in line.lower():
                        tags = [t.strip() for t in line.split(":")[-1].split(",") if t.strip()]

                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": category,
                    "tags": tags,
                })

            except Exception as e:
                results.append({
                    "query": test,
                    "model": model,
                    "strategy": strat["name"],
                    "category": f"ERROR: {e}",
                    "tags": [],
                })


Running test cases: 100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.09s/it]

+-------------------------------------------------+------------------------+------------+-----------------+---------------------------------------------------------------------+
| Query                                           | Model                  | Strategy   | Category        | Tags                                                                |
| I want a cozy sweater or jacket for fall        | 4.5-preview-2025-02-27 | cot2       | Sweater, Jacket | cozy, fall                                                          |
+-------------------------------------------------+------------------------+------------+-----------------+---------------------------------------------------------------------+
| Any recommendations for a job interview outfit? | 4.5-preview-2025-02-27 | cot2       | Unspecified     | job interview, professional, formal, polished, business-appropriate |
+-------------------------------------------------+------------------------+------------+-----------------+---




In [27]:
# --- Pretty Output ---
table = []
for r in results:
    table.append([
        r["query"],
        r["model"].replace("gpt-", ""),
        r["strategy"],
        r["category"],
        ", ".join(r["tags"]),
    ])

print(tabulate(table, headers=["Query", "Model", "Strategy", "Category", "Tags"], tablefmt="grid"))


+-------------------------------------------------+------------------------+------------+-----------------+-----------------------------------------------------------------------+
| Query                                           | Model                  | Strategy   | Category        | Tags                                                                  |
| I want a cozy sweater or jacket for fall        | 4.5-preview-2025-02-27 | cot2       | Sweater, Jacket | cozy, fall                                                            |
+-------------------------------------------------+------------------------+------------+-----------------+-----------------------------------------------------------------------+
| I want a cozy sweater or jacket for fall        | 4.5-preview-2025-02-27 | cot3       | sweater, jacket | cozy, fall                                                            |
+-------------------------------------------------+------------------------+------------+-----------

Finally, let's tune the temperature.

In [30]:
cot_prompt3 = """You are an expert fashion stylist. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what types of clothing or items are mentioned. If no specific item is mentioned, leave it blank.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <comma-separated list>
Tags: <comma-separated list>
"""

test_cases = [
    # "I need a cute dress for summer",
    # "Looking for something comfy to wear at home",
    "Show me edgy outfits for a night out",
    "I want a cozy sweater or jacket for fall",
    "Any recommendations for a job interview outfit?",
    # "Something warm but stylish for winter",
    # "Give me a wedding guest dress",
    # "What’s a good outfit for a date night?",
    # "I need a minimalist capsule wardrobe",
    "Looking for festival wear",
    # "Show me blue dresses",
]
# --- Models + Prompt Strategies ---
strategies = [
    # {"name": "cot", "prompt_template": cot_prompt},
    # {"name": "cot2", "prompt_template": cot_prompt2},
    {"name": "cot3", "prompt_template": cot_prompt3},
]
temperatures = [0, 0.2, 0.4]

models = ["gpt-4.5-preview-2025-02-27"]

# --- Main Evaluation Loop ---
results = []
for test in tqdm(test_cases, desc="Running test cases"):
    for model in models:
        for strat in strategies:
            for temperature in temperatures:
                prompt = strat["prompt_template"].format(query=test)
    
                try:
                    response = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temperature,
                    )
    
                    content = response.choices[0].message.content
                    lines = content.splitlines()
                    category = ""
                    tags = []
    
                    for line in lines:
                        if "category" in line.lower():
                            category = line.split(":")[-1].strip()
                        if "tag" in line.lower():
                            tags = [t.strip() for t in line.split(":")[-1].split(",") if t.strip()]
    
                    results.append({
                        "query": test,
                        "model": model,
                        "temperature": temperature,
                        "strategy": strat["name"],
                        "category": category,
                        "tags": tags,
                    })
    
                except Exception as e:
                    results.append({
                        "query": test,
                        "model": model,
                        "strategy": strat["name"],
                        "category": f"ERROR: {e}",
                        "tags": [],
                    })


Running test cases: 100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.29s/it]


In [31]:
# --- Pretty Output ---
table = []
for r in results:
    table.append([
        r["query"],
        r["model"].replace("gpt-", ""),
        r["strategy"],
        r["temperature"],
        r["category"],
        ", ".join(r["tags"]),
    ])

print(tabulate(table, headers=["Query", "Model", "Strategy", "Category", "Tags"], tablefmt="grid"))


+------------------------------------------+------------------------+---------+------------+-----------------+-----------------------------+
|                                          | Query                  | Model   |   Strategy | Category        | Tags                        |
| Show me edgy outfits for a night out     | 4.5-preview-2025-02-27 | cot3    |        0   |                 | edgy, night out             |
+------------------------------------------+------------------------+---------+------------+-----------------+-----------------------------+
| Show me edgy outfits for a night out     | 4.5-preview-2025-02-27 | cot3    |        0.2 |                 | edgy, night out             |
+------------------------------------------+------------------------+---------+------------+-----------------+-----------------------------+
| Show me edgy outfits for a night out     | 4.5-preview-2025-02-27 | cot3    |        0.4 |                 | edgy, night out             |
+------------

In [33]:
cot_prompt3 = """You are an expert fashion stylist. Your task is to extract the item category and relevant style tags from a user's query.

Let's think step by step.
Input: "{query}"

Step 1: Identify what types of clothing or items are mentioned. If no specific items are mentioned, leave it blank.
Step 2: Identify descriptors, aesthetics, seasons, or occasions mentioned.
Step 3: Format the output as:
Category: <comma-separated list>
Tags: <comma-separated list>
"""

test_cases = [
    "I need a cute dress for summer",
    "Looking for something comfy to wear at home",
    "Show me edgy outfits for a night out",
    "I want a cozy sweater or jacket for fall",
    "Any recommendations for a job interview outfit?",
    "Something warm but stylish for winter",
    "Give me a wedding guest dress",
    "What’s a good outfit for a date night?",
    "I need a minimalist capsule wardrobe",
    "Looking for festival wear",
    "Show me blue dresses",
]
# --- Models + Prompt Strategies ---
strategies = [
    {"name": "cot3", "prompt_template": cot_prompt3},
]
temperatures = [0, 0.2]

models = ["gpt-4.5-preview-2025-02-27"]

# --- Main Evaluation Loop ---
results = []
for test in tqdm(test_cases, desc="Running test cases"):
    for model in models:
        for strat in strategies:
            for temperature in temperatures:
                prompt = strat["prompt_template"].format(query=test)
    
                try:
                    response = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temperature,
                    )
    
                    content = response.choices[0].message.content
                    lines = content.splitlines()
                    category = ""
                    tags = []
    
                    for line in lines:
                        if "category" in line.lower():
                            category = line.split(":")[-1].strip()
                        if "tag" in line.lower():
                            tags = [t.strip() for t in line.split(":")[-1].split(",") if t.strip()]
    
                    results.append({
                        "query": test,
                        "model": model,
                        "temperature": temperature,
                        "strategy": strat["name"],
                        "category": category,
                        "tags": tags,
                    })
    
                except Exception as e:
                    results.append({
                        "query": test,
                        "model": model,
                        "strategy": strat["name"],
                        "category": f"ERROR: {e}",
                        "tags": [],
                    })


Running test cases: 100%|████████████████████████████████████████████| 11/11 [01:07<00:00,  6.12s/it]


In [34]:
# --- Pretty Output ---
table = []
for r in results:
    table.append([
        r["query"],
        r["model"].replace("gpt-", ""),
        r["strategy"],
        r["temperature"],
        r["category"],
        ", ".join(r["tags"]),
    ])

print(tabulate(table, headers=["Query", "Model", "Strategy", "Category", "Tags"], tablefmt="grid"))


+-------------------------------------------------+------------------------+---------+------------+-----------------+------------------------------------------------------------------------------+
|                                                 | Query                  | Model   |   Strategy | Category        | Tags                                                                         |
| I need a cute dress for summer                  | 4.5-preview-2025-02-27 | cot3    |        0   | Dress           | Cute, Summer                                                                 |
+-------------------------------------------------+------------------------+---------+------------+-----------------+------------------------------------------------------------------------------+
| I need a cute dress for summer                  | 4.5-preview-2025-02-27 | cot3    |        0.2 | Dress           | Cute, Summer                                                                 |
+--------------