In [7]:
from openai import OpenAI
from dotenv import load_dotenv
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import time
import requests
import random

# Read variables from a local .env file (if present) into the process environment.
load_dotenv()

# Create the OpenAI client from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Show up to 100 rows when a DataFrame is rendered.
pd.options.display.max_rows = 100

In [37]:
# Load the GMV calculation CSV into `temp`, the frame the enrichment loop iterates over.
# NOTE(review): a later cell in this file (`temp = df`) rebinds `temp`; when the
# notebook is run top to bottom, that later frame is what the loop sees — confirm
# which input is intended.
temp = pd.read_csv('data/gmv/gmv_calculation.csv')

In [25]:
# Load the full fetchGPT dataset.
df = pd.read_csv('data/fetchGPT.csv')

# Disabled sampling: take the first 500 rows each from Costco and Whole Foods Market
# costco_df = df[df['SAMPLE_STORE'] == 'COSTCO'].head(500)
# wholefoods_df = df[df['SAMPLE_STORE'] == 'WHOLE FOODS MARKET'].head(500)

# Disabled: combine the two sampled dataframes
# temp = pd.concat([costco_df, wholefoods_df], ignore_index=True)

# Disabled: verify the store distribution of the sample
# print("Store distribution:")
# print(temp['SAMPLE_STORE'].value_counts())
# With sampling disabled, `temp` is just another name for the full frame (no copy is made).
# NOTE(review): this rebinds `temp`, replacing whatever an earlier cell assigned to it
# when the notebook is executed top to bottom — confirm this is intended.
temp = df

In [38]:

# --- Assumptions: ---
# 1. `temp` is your pandas DataFrame with columns:
#      - 'ORIGINAL_ITEM_TEXT'
#      - 'SAMPLE_STORE'
#      - optionally 'PRODUCT_NUMBER'
#      - optionally 'MEDIAN(PAM.FINAL_PRICE)'
# 2. `client` is already initialized and authenticated, e.g.:
#      client = OpenAI(api_key="YOUR_KEY")

# Tunables for the enrichment run.
MODEL_NAME = "gpt-4.1"
MIN_DELAY_SECONDS = 0.1   # lower bound of the random pause between requests
MAX_DELAY_SECONDS = 0.5   # upper bound of the random pause between requests

# Column order of `results_df`. Passing it explicitly keeps these columns present
# even when no row produced a parsable answer.
RESULT_COLUMNS = [
    "ORIGINAL_ITEM_TEXT",
    "SAMPLE_STORE",
    "PRODUCT_NUMBER",
    "BRAND",
    "BRAND_SCORE",
    "CATEGORY",
    "CATEGORY_SCORE",
    "EXPANDED_DESCRIPTION",
    "REASONING_SNIPPET",
    "UPCs",
]

# Prompt sent for every row. It is a `str.format` template: the four named fields
# under "# Context" are filled per row, and literal JSON braces are doubled.
# The confidence scores in the examples are quoted so that the examples are valid
# JSON, matching what `json.loads` expects from the reply.
PROMPT_TEMPLATE = '''
    # Role and Objective
    You are an AI product description analyzer tasked with standardizing and expanding abbreviated product descriptions into clear, structured data. Your goal is to identify brands, categories, and expand abbreviated text while maintaining accuracy.

    # Instructions
    - Analyze the given abbreviated product description
    - Expand abbreviations without adding interpretive content
    - Identify the most likely brand based on text and product number
    - Categorize the product based on expanded description
    - Provide confidence scores for brand and category predictions

    ## Sub-categories for more detailed instructions
    - Expand what is directly implied in the text (e.g., "gl" to "glass", "bl" to "bottle"), unless there are specific annotations like counts or sizes that can be verified through web search, or brand.
    - Assign confidence scores from high/medium/low based on clarity of information
    - Consider store context when determining brand and category

    # Reasoning Steps
    1. Expand Abbreviations
       - Identify common product abbreviations
       - Convert to standard product terminology
       - Maintain original meaning without interpretation
       - Expand acronyms to full words (e.g. "CRV" to "California Redemption Value")
       - The format should be: brand name, product type, size, packaging, provided that this information is available through web search. 

    2. Brand Analysis
       - Look for brand indicators in text
       - Cross reference with online databases or product catalogs
       - Consider store-specific context
       - Assess confidence in brand identification
       - Standardize low confidence brands to "Brand Not Known" if brand is not clear

    3. Category Assignment
       - Analyze product characteristics
       - Determine product type
       - Assign to the closest predefined category in the available category taxonomy
       - Assign confidence based on clarity

       Available Category Taxonomy:
        - Savings & Coupons
        - Electronics
        - Home & Garden
        - Pantry
        - Beverages
        - Office & School
        - Household Supplies
        - Health & Wellness
        - Apparel & Accessories
        - Meat & Seafood
        - Frozen
        - Baby & Toddler
        - Vehicles & Parts
        - Beauty
        - Deli & Bakery
        - Scientific & Laboratory
        - Animals & Pet Supplies
        - Restaurant
        - Membership and Perks
        - Alcohol
        - Sporting Goods
        - Snacks
        - Commodities
        - Luggage & Bags
        - Media
        - Retail
        - Arts & Entertainment
        - Fees
        - Dairy
        - Toys & Games
        - Mature
        - Software
        - Produce


    # Output Format
    JSON structure with:
    {{
        "brand": "Predicted brand",
        "brand_score": "Confidence score (high/medium/low)",
        "category": "Predicted category",
        "category_score": "Confidence score (high/medium/low)",
        "expanded_description": "Expanded product description",
        "upc": "UPC if found in text or through product lookup, 'N/A' if not found",
        "reasoning": "Reasoning for predictions and description, summarized in 50 words or less"
    }}

    # Examples
    ## Example 1
    Input: "campari 12oz gl bl"
    Output: {{
        "brand": "Campari",
        "brand_score": "high",
        "category": "Alcohol",
        "category_score": "high",
        "expanded_description": "Campari 12oz Glass Bottle",
        "upc": "N/A",
        "reasoning": "Clear brand name 'Campari' present. Common abbreviations 'gl bl' clearly indicate glass bottle. Spirit category evident from brand."
    }}

    ## Example 2
    Input: "18 ct eggs 2pk"
    Output: {{
        "brand": "Sauders",
        "brand_score": "low",
        "category": "Dairy",
        "category_score": "high",
        "expanded_description": "Sauders Eggs Large White 18 Count 2 Pack",
        "upc": "033604535272",
        "reasoning": "Product number 1008 at retailer COSTCO leads to Sauders Large Eggs."
    }}

    # Context
    Here is the product information:
    - Original Item Text: "{original_item_text}"
    - Store: "{sample_store}"
    - Product Number: "{product_number}" (if applicable)
    - Median Price: "${median_price}" (if applicable)
    
    # Additional Considerations
    - Use price as a signal for brand tier (premium/standard/value)
    - Consider price ranges typical for the category
    - Factor price into confidence scores when relevant

    # Final instructions and prompt to think step by step
    1. First, expand only the abbreviated terms in the original text
    2. Then identify brand based on expanded text
    3. Finally, categorize the product based on the complete information
    4. Provide clear reasoning for each decision
    5. Find the UPC if available in the text or through product lookup
    6. Return structured JSON with confidence scores
    '''

# Prompt sections reserved for future implementation:
#   - Common Abbreviations and Terms
#   - Store-Specific Categories
#   - Product Naming Conventions


def _na_if_missing(value):
    """Return 'N/A' for a missing scalar (None, NaN, pd.NA); otherwise return `value` unchanged.

    Keeps the literal text "nan" / "<NA>" out of the prompt.
    """
    return 'N/A' if pd.isna(value) else value


def _product_number_for_prompt(value):
    """Format a product number for display in the prompt.

    Missing values become 'N/A'. A whole-number float such as 1372969.0 is
    shown as 1372969 so the model is not given a spurious '.0' to look up.
    Any other value is returned unchanged.
    """
    if pd.isna(value):
        return 'N/A'
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def _extract_json_object(output_text):
    """Return the text from the first '{' through the last '}' of `output_text`.

    Returns None when no such span exists (no opening brace, or no closing
    brace after it).
    """
    start = output_text.find('{')
    end = output_text.rfind('}') + 1
    if start == -1 or end <= start:
        return None
    return output_text[start:end]


results = []

for idx, row in temp.iterrows():
    original_item_text = row['ORIGINAL_ITEM_TEXT']
    sample_store = row['SAMPLE_STORE']
    # Raw values are kept as-is for the result record so that a later merge on
    # these columns still matches `temp`; only the prompt gets the cleaned form.
    product_number = row.get('PRODUCT_NUMBER', 'N/A')
    median_price = row.get('MEDIAN(PAM.FINAL_PRICE)', 'N/A')

    prompt = PROMPT_TEMPLATE.format(
        original_item_text=original_item_text,
        sample_store=sample_store,
        product_number=_product_number_for_prompt(product_number),
        median_price=_na_if_missing(median_price),
    )

    # Pause for a random interval before each request (base delay + jitter).
    time.sleep(random.uniform(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS))

    # Send the prompt to the GPT model
    response = client.responses.create(
        model=MODEL_NAME,
        tools=[{"type": "web_search_preview"}],
        input=prompt,
        temperature=0.0
    )

    # Extract the JSON payload from the response
    output_text = response.output_text
    json_text = _extract_json_object(output_text)
    if json_text is None:
        print(f"[Row {idx}] No JSON found. Output preview:\n{output_text[:200]}...\n")
        continue

    try:
        parsed = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"[Row {idx}] JSON parsing error: {e}\nPayload:\n{json_text}\n")
        continue

    results.append({
        "ORIGINAL_ITEM_TEXT":   original_item_text,
        "SAMPLE_STORE":         sample_store,
        "PRODUCT_NUMBER":       product_number,
        "BRAND":                parsed.get("brand", "N/A"),
        "BRAND_SCORE":          parsed.get("brand_score", "N/A"),
        "CATEGORY":             parsed.get("category", "N/A"),
        "CATEGORY_SCORE":       parsed.get("category_score", "N/A"),
        "EXPANDED_DESCRIPTION": parsed.get("expanded_description", "N/A"),
        "REASONING_SNIPPET":    parsed.get("reasoning", "N/A"),
        "UPCs":                 parsed.get("upc", "N/A")
    })


# Convert to DataFrame; explicit columns keep the schema stable when `results` is empty.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# (Optional) Save to CSV
# results_df.to_csv('data/fetchGPT_results.csv', index=False)

# Show the first few rows
results_df.head()

Unnamed: 0,ORIGINAL_ITEM_TEXT,SAMPLE_STORE,PRODUCT_NUMBER,BRAND,BRAND_SCORE,CATEGORY,CATEGORY_SCORE,EXPANDED_DESCRIPTION,REASONING_SNIPPET,UPCs
0,ny bottle de,COSTCO,,Built NY,high,Home & Garden,high,Built NY Stainless Steel Water Bottle,The abbreviation 'ny' likely refers to 'Built ...,
1,pottingsoil,COSTCO,1372969.0,Miracle-Gro,high,Home & Garden,high,Miracle-Gro Organic Choice Potting Mix 50 Quart,The product number 1372969 at Costco correspon...,
2,straw bana,COSTCO,1902754.0,Brand Not Known,low,Beverages,high,Strawberry Banana Smoothie,The term 'straw bana' likely refers to 'strawb...,
3,ks frnch fry,COSTCO,9999699.0,Kirkland Signature,high,Frozen,high,Kirkland Signature French Fries Extra-Crispy 5...,The abbreviation 'ks' corresponds to 'Kirkland...,96619999699.0
4,candy grapes,COSTCO,1073076.0,Brand Not Known,low,Produce,high,Cotton Candy Grapes,The product description 'candy grapes' refers ...,


In [39]:
# Merge the model results back onto the input rows of `temp`.
MERGE_KEYS = ['ORIGINAL_ITEM_TEXT', 'SAMPLE_STORE', 'PRODUCT_NUMBER']

# If `temp` contains the same key triple more than once, `results_df` holds one
# result row per occurrence; merging those directly would multiply the matching
# rows. Keep a single result per key so each input row gets exactly one match.
results_unique = results_df.drop_duplicates(subset=MERGE_KEYS, keep='first')

merged_df = pd.merge(
    temp,
    results_unique,
    on=MERGE_KEYS,
    how='left'
)

# A left merge against unique right-hand keys must preserve the input row count.
assert len(merged_df) == len(temp), "merge changed the number of rows"

# Persist the enriched table.
merged_df.to_csv('data/gmv/gmv.csv', index=False)

# Display a preview of the merged results
merged_df.head()

In [17]:
# Load the blind-test set and store PRODUCT_NUMBER as a nullable integer:
# values that cannot be parsed as numbers are coerced to missing.
temp = pd.read_csv('data/blind_test/blind_test.csv').assign(
    PRODUCT_NUMBER=lambda frame: pd.to_numeric(
        frame['PRODUCT_NUMBER'], errors='coerce'
    ).astype('Int64')
)
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ORIGINAL_ITEM_TEXT  100 non-null    object
 1   PRODUCT_NUMBER      49 non-null     Int64 
 2   SAMPLE_STORE        100 non-null    object
 3   SAMPLE_RECEIPT      100 non-null    object
 4   ITEM_COUNT          100 non-null    int64 
dtypes: Int64(1), int64(1), object(3)
memory usage: 4.1+ KB
