In [3]:
import pandas as pd


In [None]:
from category_tree import CategoryTree
from category_agent import CategoryAssignmentAgent
import getpass
from pprint import pprint
from tqdm.notebook import tqdm
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

In [2]:
OPENAI_API_KEY = getpass.getpass()

In [5]:
df = pd.read_csv("categories_data.csv")

In [6]:
df.CATEGORY_NAME_HIERARCHY.values[:4]

array(['Beverages > Fruit & Vegetable Juices > Wellness Shots',
       'Beverages > Fruit & Vegetable Juices > Other Juices',
       'Apparel & Accessories > Clothing > Underwear & Socks > Shapewear',
       'Pantry > Packaged Fruit & Applesauce > Other Packaged Fruit'],
      dtype=object)

In [7]:
categories_hierarchy = df.CATEGORY_NAME_HIERARCHY.unique()

In [9]:
categories_hierarchy

array(['Beverages > Fruit & Vegetable Juices > Wellness Shots',
       'Beverages > Fruit & Vegetable Juices > Other Juices',
       'Apparel & Accessories > Clothing > Underwear & Socks > Shapewear',
       'Pantry > Packaged Fruit & Applesauce > Other Packaged Fruit',
       'Sporting Goods > Athletics > Water Sports',
       'Beverages > Drink Mixes > Energy & Hydration Mixes',
       'Household Supplies > Household Cleaning Supplies > Household Cleaning Products > Dish Soap',
       'Beverages > Drink Mixes > Protein Powder',
       'Beverages > Carbonated Soft Drinks > Cola > Zero Sugar Cola',
       'Pantry > Packaged Seafood > Other Packaged Seafood',
       'Snacks > Nuts & Seeds > Other Nuts',
       'Pantry > Cooking & Baking > Starches & Thickeners',
       'Apparel & Accessories > Clothing > Maternity Clothing',
       'Apparel & Accessories > Clothing > Sweaters & Sweatshirts',
       'Pantry > Rice & Grains > Rice Blends',
       'Apparel & Accessories > Clothing > Outerw

In [6]:
tree = CategoryTree()
for category in categories_hierarchy:
    tree.add_category_path(category)

# tree.display_tree()

In [7]:
agent = CategoryAssignmentAgent(tree, OPENAI_API_KEY)

In [8]:
index = 1
product_description = df.iloc[index, 0]
true_category = df.iloc[index, 10]
result = agent.assign_category(product_description)

In [9]:
print(result["product_description"])
print(true_category)
print(result["messages"][-1].content)

ole seoky pineap bottle
Beverages > Fruit & Vegetable Juices > Other Juices
{
    "category_path": "Beverages > Fruit & Vegetable Juices > Other Juices",
    "reasoning": "Step 1: The product name 'ole seoky pineap bottle' suggests a beverage, likely a pineapple-flavored drink in a bottle. Among the root categories, 'Beverages' is the most appropriate, as the product does not fit apparel, pantry, or other categories. Step 2: Within 'Beverages', the options include various drink types. Since the product likely contains pineapple, a fruit, 'Fruit & Vegetable Juices' is the best fit. Step 3: Within 'Fruit & Vegetable Juices', there are specific juice types, but none for pineapple. 'Other Juices' is the most suitable subcategory for juices not specifically listed. Step 4: 'Other Juices' is a leaf node, so the process stops here.",
    "confidence": "high",
    "is_complete": true,
    "steps": [
        {
            "level": 1,
            "category": "Beverages",
            "reasoning":

In [10]:
# Evaluate the agent's performance
async def predict_category(agent, df):
    """Evaluate the agent's performance on the dataset."""
    correct = 0
    total = len(df)

    async def process_row(row):
        product_description = row.ORIGINAL_ITEM_TEXT
        true_category = row.CATEGORY_NAME_HIERARCHY
        result = await agent.assign_category_async(product_description)
        predicted_category = result["messages"][-1].content
        # Load as json
        predicted_category = json.loads(predicted_category)
        input_text = result["product_description"]
        predicted_category = predicted_category["category_path"]
        return (input_text, predicted_category, true_category)

    tasks = [process_row(row) for row in df.itertuples()]
    results = []

    # Process rows concurrently with tqdm progress bar
    for result in tqdm_asyncio.as_completed(tasks, total=total):
        results.append(await result)

    return results


# Run the evaluation
import nest_asyncio

nest_asyncio.apply()

results = await predict_category(agent, df.sample(500))

100%|██████████| 500/500 [00:33<00:00, 14.90it/s] 


In [11]:
# Create dataframe of results
prediction_df = pd.DataFrame(results, columns=["product_description", "predicted_categories", "true_categories"])

In [12]:
prediction_df["true_cat_tier1"] = prediction_df["true_categories"].apply(
    lambda x: x.split(">")[0] if len(x.split(">")) > 0 else None
)
prediction_df["true_cat_tier2"] = prediction_df["true_categories"].apply(
    lambda x: x.split(">")[1] if len(x.split(">")) > 1 else None
)
prediction_df["true_cat_tier3"] = prediction_df["true_categories"].apply(
    lambda x: x.split(">")[2] if len(x.split(">")) > 2 else None
)
prediction_df["pred_cat_tier1"] = prediction_df["predicted_categories"].apply(
    lambda x: x.split(">")[0] if len(x.split(">")) > 0 else None
)
prediction_df["pred_cat_tier2"] = prediction_df["predicted_categories"].apply(
    lambda x: x.split(">")[1] if len(x.split(">")) > 1 else None
)
prediction_df["pred_cat_tier3"] = prediction_df["predicted_categories"].apply(
    lambda x: x.split(">")[2] if len(x.split(">")) > 2 else None
)

In [13]:
# accuracy of tier 1
print((prediction_df["true_cat_tier1"] == prediction_df["pred_cat_tier1"]).mean())
# accuracy of tier 2
print((prediction_df["true_cat_tier2"] == prediction_df["pred_cat_tier2"]).mean())
# accuracy of tier 3
print((prediction_df["true_cat_tier3"] == prediction_df["pred_cat_tier3"]).mean())
# full accuracy
print((prediction_df["true_categories"] == prediction_df["predicted_categories"]).mean())

0.73
0.582
0.486
0.534
