In [1]:
from openai import OpenAI
from datetime import datetime, timezone
import requests
import time
import json
from typing import Iterable, List, Dict, Any, Union


In [2]:
import pandas as pd
# Read the restaurant menu CSV into `df` (used by later cells) and preview the first rows.
# NOTE(review): the saved cell output echoes this read_csv call as a warning's source line —
# presumably a pandas DtypeWarning; confirm and consider passing explicit dtypes.
df = pd.read_csv(filepath_or_buffer="../data/kaggle_restaurant_data.csv")
df.head(5)

  df = pd.read_csv("../data/kaggle_restaurant_data.csv")


Unnamed: 0,id,restaurant_name,score,ratings,restaurant_type,full_address,menu_category,menu_name,menu_item_description,price
0,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Pork Chop Rice with Gravy Plate,,7.0
1,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Full Sausage (2 pcs) with 2 Eggs,2 pieces.,7.25
2,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Bacon and Egg with Cheese Breakfast Sandwich,,3.5
3,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Double Cheese Burger,Grilled or fried patty with cheese on a bun.,3.25
4,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Full Bacon (3 pcs) with 2 Eggs,3 pieces.,7.25


### Configuration

In [3]:
# Closed vocabulary for the standardized restaurant type; entries are lowercase
# and kept in the same (alphabetical) order as before, grouped by initial letter.
ALLOWED_RESTAURANT_TYPES = [
    # a
    "acai shop", "afghani restaurant", "african restaurant", "american restaurant", "asian restaurant",
    # b
    "bagel shop", "bakery", "bar", "bar and grill", "barbecue restaurant", "brazilian restaurant",
    "breakfast restaurant", "brunch restaurant", "buffet restaurant",
    # c
    "cafe", "cafeteria", "candy store", "cat cafe", "chinese restaurant", "chocolate factory",
    "chocolate shop", "coffee shop", "confectionery",
    # d
    "deli", "dessert restaurant", "dessert shop", "diner", "dog cafe", "donut shop",
    # f
    "fast food restaurant", "fine dining restaurant", "food court", "french restaurant",
    # g-h
    "greek restaurant", "hamburger restaurant",
    # i
    "ice cream shop", "indian restaurant", "indonesian restaurant", "italian restaurant",
    # j-l
    "japanese restaurant", "juice shop", "korean restaurant", "lebanese restaurant",
    # m
    "meal delivery", "meal takeaway", "mediterranean restaurant", "mexican restaurant",
    "middle eastern restaurant",
    # p-r
    "pizza restaurant", "pub", "ramen restaurant", "restaurant",
    # s
    "sandwich shop", "seafood restaurant", "spanish restaurant", "steak house", "sushi restaurant",
    # t
    "tea house", "thai restaurant", "turkish restaurant",
    # v-w
    "vegan restaurant", "vegetarian restaurant", "vietnamese restaurant", "wine bar",
]

# Comma-separated rendering with every type wrapped in double quotes.
_quoted_types = ['"' + type_name + '"' for type_name in ALLOWED_RESTAURANT_TYPES]
restaurant_type_list = ", ".join(_quoted_types)

In [33]:
MODEL_NAME = "o4-mini"

# Prompt template. It is deliberately NOT an f-string / str.format template:
# the JSON examples below contain literal braces. The single
# "{restaurant_type_list}" marker is substituted with str.replace further down;
# without that substitution the model would receive the marker text verbatim
# instead of the allowed restaurant types.
_SYSTEM_PROMPT_TEMPLATE = """
You are a Menu Data Extractor.

Input: A list of menu items. Each item includes:
  - restaurant_name
  - restaurant_type
  - menu_name
  - menu_item_description (may be empty)
  - menu_category (may be empty)

Your task is to return a JSON array of objects (in the same order as the input), where each object includes:
  - dish_base : string (the primary dish name, e.g., "pizza")
  - dish_flavor : string[] (up to 5 flavour descriptors, e.g., "pepperoni")
  - is_combo : boolean
  - restaurant_type_std : string (standardized restaurant type)

Rules:
  • Use American English spelling, following the AP Stylebook and Merriam-Webster Dictionary as the reference.
  • All text must be in lowercase.
  • Translate any non-English dish names or terms into English before extracting features.
  
  • dish_base:
      - This is the main identity of the dish (e.g., "pizza", "lo mein", "fried rice").
      - It should be the central food item a customer would recognize the dish by.
      – Remove size indicators (e.g., "small", "XL"), quantity counts (e.g., "3 pcs"), and side items.
      – If the base is unclear or ambiguous, use "unknown".
      – Use singular form (e.g., "sandwich" not "sandwiches").
      - You need to use menu_name and elaborate the context from restaurant_name, restaurant_type, menu_item_description

  • dish_flavor:
      - Up to 5 descriptors of flavor, cooking style, toppings, sauces, etc.
      – Each tag must be no more than two words.
      – DO NOT repeat dish_base here (e.g., if dish_base is "shrimp fried rice", do not include "fried rice" again) unless it add meanings.
      – Use singular form (e.g., "egg" not "eggs")and lowercase.

  • is_combo:
      – Set to `true` if the item clearly bundles multiple components (e.g., main dish + sides + drink).

  • restaurant_type_std:
      – Must match one of the following values exactly: {restaurant_type_list}
      – Use the exact spelling and spacing and do NOT add or modify words (e.g., don't add "restaurant" if it's not part of the allowed type).
      – Start by checking whether the input field `restaurant_type` contains or closely matches any of the allowed types.
        – If it partially matches or contains keywords, normalize it accordingly (e.g., "fast food, burgers" → "fast food restaurant").
      – If no reliable match is found from `restaurant_type`, use other inputs to infer. 

Output:
  • A raw JSON array only—no additional text or formatting.
  • Do not include triple backticks (```), Markdown, or extra labels.
  • The output must be valid JSON and preserve the original input order.

Example input:
[
  {
    "restaurant_name": "Pizza World",
    "restaurant_type": "Fast Food",
    "menu_name": "Deluxe Pepperoni Combo",
    "menu_item_description": "Large pepperoni pizza with garlic bread and 2 pops",
    "menu_category": "Combo"
  },
  {
    "restaurant_name": "Sushi House",
    "restaurant_type": "Japanese",
    "menu_name": "Edamame",
    "menu_item_description": "Steamed or grilled. Served with ponzu.",
    "menu_category": ""
  }, 
  {
  "restaurant_name": "Golden Bites",
  "restaurant_type": "Fast Food, Burgers, Fries",
  "menu_name": "Bacon Cheeseburger Combo",
  "menu_item_description": "Served with fries and coke",
  "menu_category": "Burgers"
  }, 
  {
  "restaurant_name": "China Master Express",
  "restaurant_type": "Chinese, Comfort Food",
  "menu_name": "Shrimp Fried Rice",
  "menu_item_description": "Served with fried rice.",
  "menu_category": "Picked for you"
  }
]

Expected output:
[
  {
    "dish_base": "pepperoni pizza",
    "dish_flavor": ["garlic bread"],
    "is_combo": true, 
    "restaurant_type_std": "pizza restaurant"
  },
  {
    "dish_base": "edamame",
    "dish_flavor": ["steamed", "grilled", "ponzu"],
    "is_combo": false, 
    "restaurant_type_std": "japanese restaurant"
  }, 
  {
  "dish_base": "cheeseburger",
  "dish_flavor": ["bacon", "fries", "coke"],
  "is_combo": true,
  "restaurant_type_std": "fast food restaurant"
  }, 
  {
  "dish_base": "fried rice",
  "dish_flavor": ["shrimp"],
  "is_combo": false, 
  "restaurant_type_std": "chinese restaurant"
  }
]
"""

# Final system prompt: the template with the quoted, comma-separated allowed
# restaurant types (built in the configuration cell) filled in.
SYSTEM_PROMPT = _SYSTEM_PROMPT_TEMPLATE.replace("{restaurant_type_list}", restaurant_type_list)

In [12]:
class OpenAIConnector:
    """Small wrapper around the OpenAI client for batch menu-item classification.

    ``__init__`` sets three attributes: ``token`` (API key read from a file),
    ``client`` (an ``OpenAI`` instance built with that key) and
    ``current_prompt`` (initialised to ``''``; no method in this class reads it).
    """

    def __init__(self, token_path="../credentials/open_ai_token.txt"):
        """Load the API key from ``token_path`` and build the OpenAI client."""
        self.token = self._load_token(token_path)
        self.client = OpenAI(api_key=self.token)
        self.current_prompt = ''

    @staticmethod
    def _load_token(token_path):
        """Return the whitespace-stripped contents of the token file.

        Raises:
            FileNotFoundError: the file does not exist.
            RuntimeError: any other failure while reading, including an empty
                file (the inner ValueError is re-raised as RuntimeError, as in
                the original implementation).
        """
        try:
            with open(token_path, "r") as f:
                token = f.read().strip()
                if not token:
                    raise ValueError("Token file is empty.")
                return token
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Token file not found at {token_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error reading token: {e}") from e

    @staticmethod
    def _clean_row(row):
        """Replace float NaN values in a row dict with "" so the payload is valid JSON."""
        if not isinstance(row, dict):
            return row
        # `value != value` is true only for NaN; json.dumps would otherwise emit
        # the non-standard token `NaN` for missing DataFrame cells.
        return {
            key: ("" if isinstance(value, float) and value != value else value)
            for key, value in row.items()
        }

    # ─────────────────────────────────────────────────────────────
    #  PUBLIC –  main entry point
    # ─────────────────────────────────────────────────────────────

    def classify_batch(
        self,
        rows: Union[str, Iterable[Dict[str, str]]],
        model: Union[str, None] = None,
        timeout_s: int = 60
    ) -> List[Dict[str, Any]]:
        """Classify a batch of menu items with a single chat-completion request.

        Args:
            rows: Either a JSON string or an iterable of row dicts (any
                iterable, including a generator). A single dict is treated as a
                batch of one.
            model: Model name. ``None`` (default) means the module-level
                ``MODEL_NAME`` *at call time*, so re-running the configuration
                cell takes effect without re-defining this class.
            timeout_s: Request timeout in seconds passed to the client.

        Returns:
            The parsed JSON array returned by the model, one entry per input
            row. An empty batch returns ``[]`` without calling the API.

        Raises:
            ValueError: the response is not valid JSON, is not a JSON array, or
                does not contain exactly one entry per input row.
        """
        start_time = time.time()

        if model is None:
            model = MODEL_NAME

        if isinstance(rows, str):
            rows = json.loads(rows)
        if isinstance(rows, dict):
            rows = [rows]

        # Materialise once: generators cannot be serialised or measured with len().
        rows = [self._clean_row(row) for row in rows]
        if not rows:
            # Nothing to classify; also avoids dividing by zero in the timing report.
            return []

        # Make the message readable, clean JSON string
        user_msg = json.dumps(rows, indent=2)

        # Send one request for the entire batch
        resp = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            timeout=timeout_s
        )

        # Parse full JSON array (content may be None, hence TypeError as well)
        content = resp.choices[0].message.content
        try:
            results = json.loads(content)
        except (json.JSONDecodeError, TypeError) as err:
            raise ValueError("❌ Failed to parse response as JSON:\n" + str(content)) from err

        # Callers align results with the input by position, so a shape mismatch
        # must fail loudly instead of producing shifted rows.
        if not isinstance(results, list) or len(results) != len(rows):
            got = len(results) if isinstance(results, list) else type(results).__name__
            raise ValueError(
                f"❌ Expected a JSON array with {len(rows)} items, got {got}."
            )

        elapsed = time.time() - start_time
        print(f"✅ Processed {len(rows)} rows in {elapsed:.2f} seconds.")
        print(f"⏱️ Average time per row: {elapsed / len(rows):.2f} seconds.")

        return results

    def get_usage_summary(self, timeout_s: int = 30):
        """Return month-to-date usage and billing limits for the API key.

        Queries the billing usage endpoint for the span from the first day of
        the current UTC month to today, then the subscription endpoint for the
        soft/hard limits.

        Args:
            timeout_s: Per-request timeout in seconds.

        Returns:
            Dict with ``used_usd``, ``soft_limit_usd``, ``hard_limit_usd`` and
            ``remaining_usd`` (hard limit minus usage), each rounded to 2 places.

        Raises:
            requests.HTTPError: either endpoint answered with an error status
                (previously this was silently reported as zero usage/limits).
        """
        headers = {
            "Authorization": f"Bearer {self.token}"
        }

        now = datetime.now(timezone.utc)
        start_date = now.replace(day=1).strftime("%Y-%m-%d")
        end_date = now.strftime("%Y-%m-%d")

        # 1. Get usage data
        usage_resp = requests.get(
            "https://api.openai.com/v1/dashboard/billing/usage",
            params={"start_date": start_date, "end_date": end_date},
            headers=headers,
            timeout=timeout_s,
        )
        usage_resp.raise_for_status()
        usage_data = usage_resp.json()
        used_usd = usage_data.get("total_usage", 0) / 100.0  # from cents to dollars

        # 2. Get allowance (subscription limit)
        limits_resp = requests.get(
            "https://api.openai.com/v1/dashboard/billing/subscription",
            headers=headers,
            timeout=timeout_s,
        )
        limits_resp.raise_for_status()
        limits_data = limits_resp.json()
        hard_limit = limits_data.get("hard_limit_usd", 0)
        soft_limit = limits_data.get("soft_limit_usd", 0)

        remaining = hard_limit - used_usd

        return {
            "used_usd": round(used_usd, 2),
            "soft_limit_usd": round(soft_limit, 2),
            "hard_limit_usd": round(hard_limit, 2),
            "remaining_usd": round(remaining, 2)
        }

### Test on Sample Batch

In [34]:
import random

def qc(batch_index=None, batch_size=30):
    """Classify one batch of rows from `df` and return inputs and outputs side by side.

    Args:
        batch_index: Zero-based index of the batch to classify. ``None`` picks a
            random batch among those that actually exist in `df`.
        batch_size: Number of rows per batch (the last batch may be shorter).

    Returns:
        DataFrame with the five input columns followed by the columns of the
        classification result, aligned by position.

    Raises:
        ValueError: `batch_size` is not positive, `df` is empty, `batch_index`
            is outside the available batches, or the classifier returned a
            different number of items than the batch contains.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Ceiling division: a trailing partial batch still counts as a batch.
    n_batches = -(-len(df) // batch_size)
    if n_batches == 0:
        raise ValueError("df is empty; there is nothing to classify.")

    if batch_index is None:
        # Draw only from existing batches (a fixed 0..1000 range can select an empty slice).
        batch_index = random.randrange(n_batches)
    elif not 0 <= batch_index < n_batches:
        raise ValueError(
            f"batch_index must be between 0 and {n_batches - 1}, got {batch_index}."
        )

    input_columns = ["restaurant_name", "restaurant_type", "menu_name", "menu_item_description", "menu_category"]
    start = batch_size * batch_index
    df_batch = df[input_columns].iloc[start : start + batch_size]

    # Initialize connector and classify
    oai = OpenAIConnector()
    result = oai.classify_batch(df_batch.to_dict(orient='records'))

    # The concat below aligns purely by position, so the lengths must agree.
    if len(result) != len(df_batch):
        raise ValueError(
            f"Classifier returned {len(result)} items for a batch of {len(df_batch)} rows."
        )

    # Combine with original
    df_result = pd.DataFrame(result)
    df_combined = pd.concat([df_batch.reset_index(drop=True), df_result], axis=1)

    return df_combined
# Run the QC helper on batch 40 with 30 rows per batch; `result` is displayed and validated by later cells.
result = qc(batch_index=40, batch_size=30)

✅ Processed 30 rows in 43.46 seconds.
⏱️ Average time per row: 1.45 seconds.


In [35]:
# Lift pandas' display limits (column width, row count, column count) so the
# whole QC frame is rendered untruncated. These settings are global and persist
# for the rest of the session.
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

result

Unnamed: 0,restaurant_name,restaurant_type,menu_name,menu_item_description,menu_category,dish_base,dish_flavor,is_combo,restaurant_type_std
0,Dreamcakes Bakery,"Bakery, Desserts, Exclusive to Eats",Brownie,Egg and dairy.,Desserts,brownie,"[egg, dairy]",False,bakery
1,Dreamcakes Bakery,"Bakery, Desserts, Exclusive to Eats",Mini Oatmeal Creme Pie,Dairy and egg.,Desserts,oatmeal creme pie,"[dairy, egg]",False,bakery
2,Dreamcakes Bakery,"Bakery, Desserts, Exclusive to Eats",Large Oatmeal Creme Pie,Dairy and egg.,Desserts,oatmeal creme pie,"[dairy, egg]",False,bakery
3,Dreamcakes Bakery,"Bakery, Desserts, Exclusive to Eats",Big Cookie,Egg and dairy.,Desserts,cookie,"[egg, dairy]",False,bakery
4,China Master Express,"Chinese, Comfort Food",Mongolian Beef,Served with fried rice. dishes feature sliced sirloin beef and home-made sauce.,Picked for you,beef,"[mongolian, sirloin, homemade sauce, fried rice]",False,chinese restaurant
5,China Master Express,"Chinese, Comfort Food",Beef Broccoli,Served with fried rice. dishes feature sliced sirloin beef and home-made sauce.,Picked for you,beef broccoli,"[sirloin, homemade sauce, fried rice]",False,chinese restaurant
6,China Master Express,"Chinese, Comfort Food",Shrimp Broccoli,Served with fried rice.,Picked for you,shrimp broccoli,[fried rice],False,chinese restaurant
7,China Master Express,"Chinese, Comfort Food",Pepper Steak,Served with fried rice. dishes feature sliced sirloin beef and home-made sauce.,Picked for you,pepper steak,"[sirloin, homemade sauce, fried rice]",False,chinese restaurant
8,China Master Express,"Chinese, Comfort Food",Sweet and Sour Shrimp,Served with fried rice.,Picked for you,shrimp,"[sweet and sour, fried rice]",False,chinese restaurant
9,China Master Express,"Chinese, Comfort Food",Chicken Fried Rice,,Fried Rice,fried rice,[chicken],False,chinese restaurant


### Test on Standardized Restaurant Type

In [36]:
def test_restaurant_type_validity(df, allowed_types, column_name="restaurant_type_std"):
    
    values = df[column_name].dropna().str.strip().str.lower().unique()
    allowed_set = set([v.strip().lower() for v in allowed_types])
    
    invalid_values = set(values) - allowed_set
    passed = len(invalid_values) == 0

    if passed:
        print("✅ All restaurant_type_std values are valid.")
    else:
        print("❌ Invalid restaurant_type_std values found:")
        for v in invalid_values:
            print("   -", v)
    
    return passed, invalid_values

# Validate the standardized types produced for the QC batch against the allowed vocabulary.
test_restaurant_type_validity(df=result, allowed_types=ALLOWED_RESTAURANT_TYPES)

✅ All restaurant_type_std values are valid.


(True, set())