In [1]:
from openai import OpenAI
from datetime import datetime, timezone
import requests
import time
import json
from typing import Iterable, List, Dict, Any, Union


In [2]:
import pandas as pd
# Load the raw restaurant/menu CSV that every later cell reads through `df`.
# NOTE(review): the recorded output echoes this read_csv line the way Python
# warnings do; if that is a mixed-dtype warning, consider passing explicit
# `dtype=` for the affected columns — TODO confirm which columns are involved.
df = pd.read_csv(filepath_or_buffer="../data/kaggle_restaurant_data.csv")
# Preview the first rows to sanity-check the columns.
df.head(n=5)

  df = pd.read_csv("../data/kaggle_restaurant_data.csv")


Unnamed: 0,id,restaurant_name,score,ratings,restaurant_type,full_address,menu_category,menu_name,menu_item_description,price
0,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Pork Chop Rice with Gravy Plate,,7.0
1,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Full Sausage (2 pcs) with 2 Eggs,2 pieces.,7.25
2,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Bacon and Egg with Cheese Breakfast Sandwich,,3.5
3,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Double Cheese Burger,Grilled or fried patty with cheese on a bun.,3.25
4,5.0,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches","314 17th St N, Birmingham, AL, 35203",Picked for you,Full Bacon (3 pcs) with 2 Eggs,3 pieces.,7.25


### Configuration

In [3]:
# Default chat model used by OpenAIConnector.classify_batch.
MODEL_NAME = "o4-mini"

# System prompt sent with every batch. It defines the input item schema, the
# three output fields (dish_base, dish_flavor, is_combo), the normalization
# rules, and one worked example.
# The worked example must agree with the field definitions above it: the base
# is the bare dish ("pizza") and the qualifier ("pepperoni") is a flavor tag.
SYSTEM_PROMPT = """
You are a Menu Data Extractor.

Input: A list of menu items. Each item includes:
  - restaurant_name
  - restaurant_type
  - menu_name
  - menu_item_description (may be empty)
  - menu_category (may be empty)

Your task is to return a JSON array of objects (in the same order as the input), where each object includes:
  - dish_base : string (the primary dish name, e.g., "pizza")
  - dish_flavor : string[] (up to 5 flavour descriptors, e.g., "pepperoni")
  - is_combo : boolean

Rules:
  • Use American English spelling, following the AP Stylebook and Merriam-Webster Dictionary as the reference.
  • All text must be in lowercase.
  • Translate any non-English dish names or terms into English before extracting features.
  
  • dish_base:
      – Remove size indicators (e.g., "small", "XL"), quantity counts (e.g., "3 pcs"), and side items.
      – If the base is unclear or ambiguous, use "unknown".
      – Use singular form (e.g., "sandwich" not "sandwiches").
      - You need to use menu_name and elaborate the context from restaurant_name, restaurant_type, menu_item_description

  • dish_flavor:
      – Each tag must be no more than two words.
      – Avoid duplicating the dish_base unless it adds meaning.
      – Use singular form (e.g., "egg" not "eggs").

  • is_combo:
      – Set to `true` if the item clearly bundles multiple components (e.g., main dish + sides + drink).

Output:
  • A raw JSON array only—no additional text or formatting.
  • Do not include triple backticks (```), Markdown, or extra labels.
  • The output must be valid JSON and preserve the original input order.

Example input:
[
  {
    "restaurant_name": "Pizza World",
    "restaurant_type": "Fast Food",
    "menu_name": "Deluxe Pepperoni Combo",
    "menu_item_description": "Large pepperoni pizza with garlic bread and 2 pops",
    "menu_category": "Combo"
  },
  {
    "restaurant_name": "Sushi House",
    "restaurant_type": "Japanese",
    "menu_name": "Edamame",
    "menu_item_description": "Steamed or grilled. Served with ponzu.",
    "menu_category": ""
  }
]

Expected output:
[
  {
    "dish_base": "pizza",
    "dish_flavor": ["pepperoni", "garlic bread"],
    "is_combo": true
  },
  {
    "dish_base": "edamame",
    "dish_flavor": ["steamed", "grilled", "ponzu"],
    "is_combo": false
  }
]
"""

In [4]:
class OpenAIConnector:
    """Small wrapper around the OpenAI client for batch menu classification.

    The API token is read from a local text file at construction time and is
    used both for the SDK client and for the raw billing requests made by
    ``get_usage_summary``.
    """

    # Timeout (seconds) for the plain HTTP billing requests, so a stalled
    # connection cannot hang the notebook cell indefinitely.
    _BILLING_TIMEOUT_S = 30

    def __init__(self, token_path="../credentials/open_ai_token.txt"):
        """Load the token from ``token_path`` and build the OpenAI client."""
        self.token = self._load_token(token_path)
        self.client = OpenAI(api_key=self.token)
        self.current_prompt = ''

    @staticmethod
    def _load_token(token_path):
        """Return the stripped contents of the token file.

        Raises:
            FileNotFoundError: if ``token_path`` does not exist.
            RuntimeError: if the file cannot be read or is empty.
        """
        try:
            with open(token_path, "r") as f:
                token = f.read().strip()
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"Token file not found at {token_path}") from exc
        except Exception as exc:
            raise RuntimeError(f"Error reading token: {exc}") from exc

        if not token:
            # Raised outside the try block so it is not caught and re-wrapped;
            # type and message match what callers have always received.
            raise RuntimeError("Error reading token: Token file is empty.")
        return token

    # ─────────────────────────────────────────────────────────────
    #  PUBLIC –  main entry point
    # ─────────────────────────────────────────────────────────────

    def classify_batch(
        self,
        rows: Union[str, Iterable[Dict[str, str]]],
        model: str = MODEL_NAME,
        timeout_s: int = 60
    ) -> List[Dict[str, Any]]:
        """Classify a batch of menu items with a single chat-completion call.

        Args:
            rows: Menu items as an iterable of dicts, or a JSON string encoding
                such a list. A single dict is treated as a batch of one.
            model: Chat model name.
            timeout_s: Request timeout in seconds passed to the client.

        Returns:
            One parsed result object per input row, in input order. An empty
            batch returns ``[]`` without calling the API.

        Raises:
            ValueError: if the reply is empty, is not valid JSON, is not a JSON
                array, or does not contain exactly one result per input row.
        """
        start_time = time.perf_counter()

        if isinstance(rows, str):
            rows = json.loads(rows)
        if isinstance(rows, dict):
            rows = [rows]
        # Materialize once: generators cannot be JSON-serialized or measured
        # with len(), and the row count is needed after the request.
        rows = list(rows)
        if not rows:
            # Nothing to classify; also avoids dividing by zero below.
            return []

        # Readable JSON payload; keep non-ASCII menu text as-is rather than
        # \uXXXX escapes so the model sees the original characters.
        user_msg = json.dumps(rows, indent=2, ensure_ascii=False)

        # Send one request for the entire batch
        resp = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            timeout=timeout_s
        )

        # Parse full JSON array
        content = resp.choices[0].message.content
        if not content:
            # content can be missing/empty; json.loads(None) would raise an
            # unrelated TypeError instead of the documented ValueError.
            raise ValueError("❌ Model returned an empty response.")
        try:
            results = json.loads(content)
        except json.JSONDecodeError as exc:
            raise ValueError("❌ Failed to parse response as JSON:\n" + content) from exc

        # Callers pair results with rows by position, so a wrong shape or
        # count must fail loudly instead of silently misaligning.
        if not isinstance(results, list):
            raise ValueError("❌ Expected a JSON array in the response:\n" + content)
        if len(results) != len(rows):
            raise ValueError(
                f"❌ Expected {len(rows)} results but received {len(results)}."
            )

        elapsed = time.perf_counter() - start_time
        print(f"✅ Processed {len(rows)} rows in {elapsed:.2f} seconds.")
        print(f"⏱️ Average time per row: {elapsed / len(rows):.2f} seconds.")

        return results

    def get_usage_summary(self):
        """Return month-to-date spend and billing limits in USD.

        Returns:
            Dict with ``used_usd``, ``soft_limit_usd``, ``hard_limit_usd`` and
            ``remaining_usd`` (hard limit minus usage), each rounded to cents.

        Raises:
            requests.HTTPError: if either billing endpoint answers with an
                error status (previously this was silently reported as 0).
        """
        # NOTE(review): these /v1/dashboard/billing/* routes are not part of
        # the documented public API — confirm they still accept API keys.
        headers = {
            "Authorization": f"Bearer {self.token}"
        }

        now = datetime.now(timezone.utc)
        start_date = now.replace(day=1).strftime("%Y-%m-%d")
        # NOTE(review): on the 1st of the month start_date == end_date;
        # verify how the endpoint treats an empty/inclusive range.
        end_date = now.strftime("%Y-%m-%d")

        # 1. Get usage data
        usage_resp = requests.get(
            "https://api.openai.com/v1/dashboard/billing/usage",
            params={"start_date": start_date, "end_date": end_date},
            headers=headers,
            timeout=self._BILLING_TIMEOUT_S,
        )
        usage_resp.raise_for_status()
        usage_data = usage_resp.json()
        used_usd = (usage_data.get("total_usage") or 0) / 100.0  # from cents to dollars

        # 2. Get allowance (subscription limit)
        limits_resp = requests.get(
            "https://api.openai.com/v1/dashboard/billing/subscription",
            headers=headers,
            timeout=self._BILLING_TIMEOUT_S,
        )
        limits_resp.raise_for_status()
        limits_data = limits_resp.json()
        # `or 0` also covers explicit nulls, which would break round() below.
        hard_limit = limits_data.get("hard_limit_usd") or 0
        soft_limit = limits_data.get("soft_limit_usd") or 0

        remaining = hard_limit - used_usd

        return {
            "used_usd": round(used_usd, 2),
            "soft_limit_usd": round(soft_limit, 2),
            "hard_limit_usd": round(hard_limit, 2),
            "remaining_usd": round(remaining, 2)
        }

### Test on Sample Batch

In [5]:
import random

def qc(batch_index=None, batch_size=30, connector=None, source_df=None):
    """Classify one batch of menu rows and return inputs and outputs side by side.

    Args:
        batch_index: Zero-based index of the batch to classify. When ``None``,
            a random batch that exists in the data is chosen.
        batch_size: Number of rows per batch; must be positive.
        connector: Object exposing ``classify_batch(rows)``. Defaults to a
            freshly constructed ``OpenAIConnector()``.
        source_df: DataFrame to take rows from. Defaults to the notebook-level
            ``df``.

    Returns:
        DataFrame holding the five input columns of the batch followed by the
        columns built from the classifier's results, on a fresh 0..n-1 index.

    Raises:
        ValueError: if ``batch_size`` is not positive, the data is empty,
            ``batch_index`` is out of range, or the classifier does not return
            exactly one result per row.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    frame = df if source_df is None else source_df
    input_cols = [
        "restaurant_name",
        "restaurant_type",
        "menu_name",
        "menu_item_description",
        "menu_category",
    ]

    # Ceiling division: the last batch may be shorter than batch_size.
    n_batches = -(-len(frame) // batch_size)
    if n_batches == 0:
        raise ValueError("No rows available to build a batch from.")

    if batch_index is None:
        # Sample only among batches that exist instead of a fixed 0..1000 range.
        batch_index = random.randrange(n_batches)
    if not 0 <= batch_index < n_batches:
        raise ValueError(
            f"batch_index must be in [0, {n_batches - 1}], got {batch_index}."
        )

    start = batch_size * batch_index
    df_batch = frame[input_cols].iloc[start:start + batch_size]

    # Missing cells would otherwise be serialized as the bare token NaN, which
    # is not valid JSON; send them as empty strings instead.
    payload = (
        df_batch.astype(object)
        .where(df_batch.notna(), "")
        .to_dict(orient="records")
    )

    # Initialize connector and classify
    oai = OpenAIConnector() if connector is None else connector
    result = oai.classify_batch(payload)

    # Combine with original
    df_result = pd.DataFrame(result)
    if len(df_result) != len(df_batch):
        # A positional concat would silently pad/misalign rows otherwise.
        raise ValueError(
            f"Expected {len(df_batch)} results but received {len(df_result)}."
        )

    return pd.concat([df_batch.reset_index(drop=True), df_result], axis=1)
# Run the quality check on one fixed batch (index 10, 30 rows) so the sample is
# the same on every run; `result` is displayed in the next cell.
result = qc(batch_index=10, batch_size=30)

✅ Processed 30 rows in 47.68 seconds.
⏱️ Average time per row: 1.59 seconds.


In [6]:
# Lift pandas' display truncation so long descriptions and every column/row of
# the batch are visible. These are global options: they stay in effect for all
# later cells in this kernel.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Last expression of the cell: render the combined input/output frame.
result

Unnamed: 0,restaurant_name,restaurant_type,menu_name,menu_item_description,menu_category,dish_base,dish_flavor,is_combo
0,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Cappuccino,"Regular (130 Cal.), Large (160 Cal.) Espresso with steamed milk, topped with a cap of foam. Allergens: Contains Milk",Breakfast,cappuccino,"[espresso, steamed milk, foam]",False
1,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Americano,10 Cal. Two shots of espresso made with our favorite espresso beans combined with hot water. Served hot. Allergens: none,Breakfast,americano,"[espresso, hot water]",False
2,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Espresso,10 Cal. A double espresso shot made with our favorite espresso beans. Served hot. Allergens: none,Breakfast,espresso,[hot],False
3,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Frozen Caramel Cold Brew,480 Cal. Caramel and an icy cold brew coffee blend topped with whipped cream and caramel syrup. Allergens: Contains Milk,Breakfast,cold brew,"[caramel, whipped cream, caramel syrup]",False
4,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Frozen Chocolate Cold Brew,440 Cal. Chocolate and an icy cold brew coffee blend topped with whipped cream and chocolate syrup. Allergens: Contains Milk,Breakfast,cold brew,"[chocolate, whipped cream, chocolate syrup]",False
5,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Iced Caramel Latte,"440 Cal. Freshly brewed espresso, milk and caramel served over ice. Allergens: Contains Milk",Breakfast,latte,"[caramel, espresso, milk]",False
6,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Iced Chocolate Latte,"400 Cal. Freshly brewed espresso, foamed milk and chocolate flavored syrup served over ice. Allergens: Contains Milk",Breakfast,latte,"[chocolate, espresso, foamed milk]",False
7,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Iced Chai Tea Latte,"290 Cal. Freshly brewed black tea with honey, vanilla, cardamom, cinnamon, ginger and foamed milk served over ice. Allergens: Contains Milk",Breakfast,latte,"[chai tea, honey, vanilla, cardamom, cinnamon]",False
8,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Iced Madagascar Vanilla Latte,"290 Cal. Freshly brewed espresso with foamed milk and Madagascar vanilla syrup, served over ice and topped with whipped cream. Allergens: Contains Milk",Breakfast,latte,"[madagascar vanilla, espresso, milk, whipped cream]",False
9,Panera (521 Fieldstown Road),"Breakfast and Brunch, salad, Sandwich, Family Meals, Pizza, Healthy, American, Chicken",Iced Caffe Latte,160 Cal. Freshly brewed espresso and milk served over ice. Allergens: Contains Milk,Breakfast,latte,"[espresso, milk]",False
