In [1]:
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned data
df = pd.read_csv('nyctastematch-restaurant-recommender/data/processed/restaurants_data_merged_final.csv')

df['cuisine'] = df['cuisine'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

print(f"Loaded {len(df)} restaurants")
print(df.head())

Loaded 4598 restaurants
                     name  price  rating  review_count  \
0    123 Burger Shot Beer      1     3.0        1000.0   
1     One Stop Patty Shop      1     4.0          40.0   
2  108 Food Dried Hot Pot      2     3.5         139.0   
3                Cookshop      2     4.0        1000.0   
4        11 Hanover Greek      3     4.0         122.0   

                                             cuisine  \
0  [american, sportsbars, tradamerican, chicken_w...   
1              [bakery, caribbean, breakfast_brunch]   
2                                  [chinese, hotpot]   
3  [american, newamerican, breakfast_brunch, wine...   
4                        [greek, seafood, wine_bars]   

                                 address   display_phone comments  
0  738 10th Ave, Hells Kitchen, NY 10019  (212) 315-0123      NaN  
1   1708 Amsterdam Ave, Harlem, NY 10031  (212) 491-7466      NaN  
2   2794 Broadway, East Harlem, NY 10025  (917) 675-6878      NaN  
3   156 10th Ave, 

In [3]:
# Create a rich text description for each restaurant for better embeddings
def create_restaurant_description(row):
    description_parts = []
    
    # Restaurant name
    description_parts.append(f"Restaurant name: {row['name']}")
    
    # Cuisine types
    if isinstance(row['cuisine'], list) and len(row['cuisine']) > 0:
        cuisines = ', '.join(row['cuisine'])
        description_parts.append(f"Cuisine types: {cuisines}")
    
    # Price range
    price_labels = {1: "Budget-friendly", 2: "Moderate", 3: "Upscale", 4: "Fine dining"}
    if pd.notna(row['price']) and row['price'] in price_labels:
        description_parts.append(f"Price range: {price_labels[row['price']]}")
    
    # Rating
    if pd.notna(row['rating']):
        description_parts.append(f"Rating: {row['rating']}/5 stars")
    
    # Review count (popularity indicator)
    if pd.notna(row['review_count']):
        if row['review_count'] > 500:
            description_parts.append("Very popular restaurant")
        elif row['review_count'] > 100:
            description_parts.append("Popular restaurant")
    
    # Location
    if pd.notna(row['address']):
        description_parts.append(f"Location: {row['address']}")
    
    # Comments (if available)
    if pd.notna(row['comments']) and row['comments'] != 'N/A':
        description_parts.append(f"Reviews: {row['comments']}")
    
    return ". ".join(description_parts)

# Create descriptions for all restaurants
df['description'] = df.apply(create_restaurant_description, axis=1)

print("\nSample description:")
print(df['description'].iloc[0])


Sample description:
Restaurant name: 123 Burger Shot Beer. Cuisine types: american, sportsbars, tradamerican, chicken_wings. Price range: Budget-friendly. Rating: 3.0/5 stars. Very popular restaurant. Location: 738 10th Ave, Hells Kitchen, NY 10019


In [4]:
from sentence_transformers import SentenceTransformer

# Load embedding model

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings for all restaurants...")

# Generate embeddings for all restaurant descriptions
restaurant_embeddings = embedding_model.encode(
    df['description'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"Generated embeddings with shape: {restaurant_embeddings.shape}")

# Save embeddings for future use
np.save('restaurant_embeddings.npy', restaurant_embeddings)
print("Embeddings saved to restaurant_embeddings.npy")

Generating embeddings for all restaurants...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 144/144 [00:38<00:00,  3.76it/s]

Generated embeddings with shape: (4598, 384)
Embeddings saved to restaurant_embeddings.npy





In [5]:
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB client
client = chromadb.Client(Settings(
    persist_directory="./chroma_db",
    anonymized_telemetry=False
))

# Create or get collection
try:
    collection = client.delete_collection("restaurants")
except:
    pass

collection = client.create_collection(
    name="restaurants",
    metadata={"description": "NYC Restaurant recommendations"}
)

# Add documents to ChromaDB
print("Adding restaurants to vector database...")

documents = df['description'].tolist()
ids = [str(i) for i in range(len(df))]
metadatas = df[['name', 'price', 'rating', 'address']].to_dict('records')

# Convert embeddings to list for ChromaDB
embeddings_list = restaurant_embeddings.tolist()

# Add to collection
collection.add(
    documents=documents,
    embeddings=embeddings_list,
    ids=ids,
    metadatas=metadatas
)

print(f"Added {len(documents)} restaurants to vector database")

Adding restaurants to vector database...
Added 4598 restaurants to vector database


In [19]:
import re
from typing import Dict, List, Optional, Tuple

class QueryParser:
    """
    Parse natural language queries to extract filters and preferences
    """
    
    def __init__(self):
        # Price keywords mapping
        self.price_keywords = {
            'cheap': [1],
            'affordable': [1, 2],
            'budget': [1],
            'budget-friendly': [1, 2],
            'inexpensive': [1],
            'economical': [1, 2],
            'moderate': [2],
            'mid-range': [2, 3],
            'expensive': [3, 4],
            'upscale': [3, 4],
            'fancy': [3, 4],
            'fine dining': [4],
            'luxury': [4],
            'high-end': [4],
            'splurge': [3, 4],
            'fancy': [4]
        }
        
        # Rating keywords mapping
        self.rating_keywords = {
            'excellent': 4.5,
            'amazing': 4.5,
            'outstanding': 4.5,
            'great': 4.0,
            'good': 3.5,
            'highly rated': 4.0,
            'top rated': 4.5,
            'best': 4.5,
            'decent': 3.0,
            'okay': 2.5,
            'popular': 3.5,
            'well-reviewed': 4.0
        }
        
        # Cuisine keywords 
        self.cuisine_keywords = [
            'italian', 'chinese', 'japanese', 'mexican', 'indian', 
            'thai', 'french', 'american', 'mediterranean', 'greek',
            'korean', 'vietnamese', 'spanish', 'middle_eastern',
            'caribbean', 'ethiopian', 'brazilian', 'peruvian',
            'sushi', 'pizza', 'burger', 'seafood', 'steakhouse',
            'vegetarian', 'vegan', 'bakery', 'cafe', 'diner',
            'asian_fusion', 'latin', 'soul_food', 'barbecue'
        ]
    
    def parse_query(self, query: str) -> Dict:
        """
        Parse natural language query and extract filters
        
        Returns:
            Dictionary with extracted filters and cleaned query
        """
        query_lower = query.lower()
        
        filters = {
            'price_filter': None,
            'min_rating': None,
            'cuisine_filter': [],
            'cleaned_query': query
        }
        
        # Extract price preferences
        filters['price_filter'] = self._extract_price(query_lower)
        
        # Extract rating preferences
        filters['min_rating'] = self._extract_rating(query_lower)
        
        # Extract cuisine preferences
        filters['cuisine_filter'] = self._extract_cuisines(query_lower)
        
        # Clean the query (remove filter keywords for better semantic search)
        filters['cleaned_query'] = self._clean_query(query_lower)
        
        return filters
    
    def _extract_price(self, query: str) -> Optional[List[int]]:
        """Extract price filter from query"""
        # Check for explicit dollar signs
        dollar_match = re.search(r'\$+', query)
        if dollar_match:
            dollar_count = len(dollar_match.group())
            return list(range(1, min(dollar_count + 1, 5)))
        
        # Check for price keywords
        for keyword, price_range in self.price_keywords.items():
            if keyword in query:
                return price_range
        
        # Check for "under $X" or "less than $X"
        under_match = re.search(r'(?:under|less than|below|max|maximum)\s*\$?(\d+)', query)
        if under_match:
            max_price = int(under_match.group(1))
            if max_price <= 15:
                return [1]
            elif max_price <= 30:
                return [1, 2]
            elif max_price <= 60:
                return [1, 2, 3]
            else:
                return [1, 2, 3, 4]
        
        return None
    
    def _extract_rating(self, query: str) -> Optional[float]:
        """Extract minimum rating from query"""
        # Check for explicit ratings like "4 stars" or "4+ stars"
        star_match = re.search(r'(\d+(?:\.\d+)?)\+?\s*(?:star|rating)', query)
        if star_match:
            return float(star_match.group(1))
        
        # Check for rating keywords
        for keyword, min_rating in self.rating_keywords.items():
            if keyword in query:
                return min_rating
        
        # Check for "above X rating" or "over X stars"
        above_match = re.search(r'(?:above|over|at least|minimum)\s*(\d+(?:\.\d+)?)', query)
        if above_match:
            return float(above_match.group(1))
        
        return None
    
    def _extract_cuisines(self, query: str) -> List[str]:
        """Extract cuisine preferences from query"""
        found_cuisines = []
        
        for cuisine in self.cuisine_keywords:
            # Check for exact matches or variations
            pattern = r'\b' + cuisine.replace('_', r'[\s_-]') + r'\b'
            if re.search(pattern, query):
                found_cuisines.append(cuisine)
        
        return found_cuisines
    
    def _clean_query(self, query: str) -> str:
        """
        Remove filter-related keywords to create a cleaner semantic query
        Keep the core intent (e.g., "romantic dinner", "quick lunch")
        """
        # Remove price-related words
        price_words = '|'.join(self.price_keywords.keys())
        query = re.sub(r'\b(?:' + price_words + r')\b', '', query, flags=re.IGNORECASE)
        
        # Remove rating-related words
        rating_words = '|'.join(self.rating_keywords.keys())
        query = re.sub(r'\b(?:' + rating_words + r')\b', '', query, flags=re.IGNORECASE)
        
        # Remove explicit filters
        query = re.sub(r'(?:under|less than|below|max|maximum|above|over|at least|minimum)\s*\$?\d+', '', query)
        query = re.sub(r'\d+\+?\s*(?:star|rating)s?', '', query)
        query = re.sub(r'\$+', '', query)
        
        # Clean up extra whitespace
        query = ' '.join(query.split())
        
        return query.strip()
    
    def print_parsed_query(self, filters: Dict):
        """Pretty print parsed query information"""
        print("\n" + "="*60)
        print("PARSED QUERY")
        print("="*60)
        
        if filters['price_filter']:
            price_labels = {1: "$", 2: "$$", 3: "$$$", 4: "$$$$"}
            prices = [price_labels[p] for p in filters['price_filter']]
            print(f"üí∞ Price Range: {', '.join(prices)}")
        
        if filters['min_rating']:
            print(f"‚≠ê Minimum Rating: {filters['min_rating']}/5")
        
        if filters['cuisine_filter']:
            print(f"üçΩÔ∏è  Cuisines: {', '.join(filters['cuisine_filter'])}")
        
        print(f"üîç Semantic Query: '{filters['cleaned_query']}'")
        print("="*60 + "\n")

In [26]:
def search_restaurants(query, top_k=5, verbose=True):
    """
    Intelligent restaurant search with automatic query parsing
    
    Args:
        query: Natural language query
        top_k: Number of results to return
        verbose: Print parsed query details
    
    Returns:
        List of restaurant recommendations
    """
    # Initialize parser
    parser = QueryParser()
    
    # Parse the query
    filters = parser.parse_query(query)
    
    # Print parsed information
    if verbose:
        parser.print_parsed_query(filters)
    
    # Use the cleaned query for semantic search
    search_query = filters['cleaned_query'] if filters['cleaned_query'] else query
    
    # Generate query embedding
    query_embedding = embedding_model.encode(search_query).tolist()
    
    # Build where clause for ChromaDB
    where_clause = None
    conditions = []
    
    if filters['price_filter']:
        conditions.append({"price": {"$in": filters['price_filter']}})
    if filters['min_rating']:
        conditions.append({"rating": {"$gte": filters['min_rating']}})
    
    if len(conditions) > 1:
        where_clause = {"$and": conditions}
    elif len(conditions) == 1:
        where_clause = conditions[0]
    
    # Query ChromaDB with more results for cuisine filtering
    n_results = top_k * 3 if filters['cuisine_filter'] else top_k
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        where=where_clause
    )
    
    # Format and filter results
    recommendations = []
    for i in range(len(results['ids'][0])):
        idx = int(results['ids'][0][i])
        restaurant = df.iloc[idx].to_dict()
        
        # Apply cuisine filter if specified
        if filters['cuisine_filter']:
            restaurant_cuisines = [c.lower() for c in restaurant['cuisine']]
            if not any(cuisine in restaurant_cuisines for cuisine in filters['cuisine_filter']):
                continue
        
        restaurant['similarity_score'] = 1 - results['distances'][0][i]
        recommendations.append(restaurant)
        
        if len(recommendations) >= top_k:
            break
    
    return recommendations

In [27]:
def recommend_restaurants(query, top_k=5):
    """
    Main recommendation function with intelligent query parsing
    """
    print(f"\n{'='*60}")
    print(f"USER QUERY: {query}")
    print(f"{'='*60}")
    
    # Get recommendations
    recommendations = search_restaurants(query, top_k=top_k, verbose=True)
    
    # Display results
    if not recommendations:
        print("No restaurants found matching your criteria.")
        print("Try relaxing some filters or using different keywords.")
        return []
    
    print(f"Found {len(recommendations)} restaurant(s):\n")
    
    for i, restaurant in enumerate(recommendations, 1):
        print(f"{i}. {restaurant['name']}")
        print(f"   {'‚≠ê' * int(restaurant['rating'])} {restaurant['rating']}/5 ({restaurant.get('review_count', 0):.0f} reviews)")
        print(f"   Cuisine: {', '.join(restaurant['cuisine'])}")
        print(f"   Price: {'$' * restaurant['price']}")
        print(f"   Address: {restaurant['address']}")
        if restaurant['display_phone'] != 'N/A':
            print(f"   Phone: {restaurant['display_phone']}")
        print(f"   Match Score: {restaurant['similarity_score']:.1%}")
        print()
    
    return recommendations

In [28]:
# Test cases
test_queries = [
    "affordable restaurant with decent rating",
    "I want a cheap Italian place with good reviews",
    "find me an upscale sushi restaurant",
    "budget-friendly Mexican food that's highly rated",
    "expensive French restaurant for anniversary dinner",
    "under $30 per person with at least 4 stars",
    "best pizza place that's not too pricey",
    "fine dining experience with excellent service",
    "quick lunch spot, nothing fancy, good rating",
    "vegetarian friendly restaurant, moderate price, 4+ stars"
]

print("\n" + "="*60)
print("TESTING INTELLIGENT QUERY PARSING")
print("="*60)

for query in test_queries[:3]:  # Test first 3
    results = recommend_restaurants(query, top_k=3)
    input("Press Enter to continue to next query...")


TESTING INTELLIGENT QUERY PARSING

USER QUERY: affordable restaurant with decent rating

PARSED QUERY
üí∞ Price Range: $, $$
‚≠ê Minimum Rating: 3.0/5
üîç Semantic Query: 'restaurant with rating'

Found 3 restaurant(s):

1. 21 Club
   ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê 5.0/5 (3262 reviews)
   Cuisine: american
   Price: $$
   Address: 21 West 52nd Street New York, NY  10019
   Phone: nan
   Match Score: 22.0%

2. Taste Kitchen
   ‚≠ê‚≠ê‚≠ê 3.0/5 (42 reviews)
   Cuisine: chinese
   Price: $$
   Address: 316 W 36th St, Garment District, NY 10018
   Phone: (646) 666-0531
   Match Score: 17.0%

3. Benchmark Restaurant
   ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê 5.0/5 (836 reviews)
   Cuisine: contemporary_american
   Price: $$
   Address: 339A Second St. Brooklyn, NY  11215
   Phone: nan
   Match Score: 16.7%


USER QUERY: I want a cheap Italian place with good reviews

PARSED QUERY
üí∞ Price Range: $
‚≠ê Minimum Rating: 3.5/5
üçΩÔ∏è  Cuisines: italian
üîç Semantic Query: 'i want a italian place with reviews'

Found 3 restaura

In [30]:
def interactive_restaurant_finder():
    """
    Interactive restaurant recommendation system
    """
    print("\n" + "="*60)
    print(" RESTAURANT RECOMMENDATION SYSTEM")
    print("="*60)
    print("\nExamples of what you can ask:")
    print("  ‚Ä¢ 'affordable Italian restaurant with good ratings'")
    print("  ‚Ä¢ 'upscale sushi place for date night'")
    print("  ‚Ä¢ 'cheap Mexican food under $20'")
    print("  ‚Ä¢ 'best burger spot, nothing fancy'")
    print("\nType 'quit' to exit\n")
    
    while True:
        query = input("What kind of restaurant are you looking for? ").strip()
        
        if query.lower() in ['quit', 'exit', 'q']:
            print("\nThanks for using the restaurant finder!")
            break
        
        if not query:
            print("Please enter a search query.\n")
            continue
        
        # Get recommendations
        recommendations = recommend_restaurants(query, top_k=5)
        
        if recommendations:
            feedback = input("\nWould you like details on any restaurant? (Enter number or press Enter to skip): ").strip()
            if feedback.isdigit() and 1 <= int(feedback) <= len(recommendations):
                selected = recommendations[int(feedback) - 1]
                print(f"\n {selected['name']}")
                print(f"   {selected['address']}")
                print(f"   {selected['display_phone']}")
        
        another = input("\nSearch for another restaurant? (yes/no): ").strip().lower()
        if another not in ['yes', 'y', '']:
            print("\n Thanks for using the restaurant finder!")
            break
        print("\n")

interactive_restaurant_finder()


 RESTAURANT RECOMMENDATION SYSTEM

Examples of what you can ask:
  ‚Ä¢ 'affordable Italian restaurant with good ratings'
  ‚Ä¢ 'upscale sushi place for date night'
  ‚Ä¢ 'cheap Mexican food under $20'
  ‚Ä¢ 'best burger spot, nothing fancy'

Type 'quit' to exit


USER QUERY: pizza at an affordable rate

PARSED QUERY
üí∞ Price Range: $, $$
üçΩÔ∏è  Cuisines: pizza
üîç Semantic Query: 'pizza at an rate'

Found 5 restaurant(s):

1. Lombardo's Pizza and Pasta
   ‚≠ê‚≠ê‚≠ê‚≠ê 4.0/5 (96 reviews)
   Cuisine: italian, pizza, vegetarian_friendly
   Price: $$
   Address: 1203 Mamaroneck Ave, White Plains, NY 10605-4804
   Phone: +1 914-615-9090
   Match Score: 1.4%

2. Joey Pepperoni's Pizza
   ‚≠ê‚≠ê‚≠ê‚≠ê 4.0/5 (90 reviews)
   Cuisine: pizza
   Price: $
   Address: 493 2nd Ave, Gramercy, NY 10016
   Phone: (212) 213-5558
   Match Score: -2.6%

3. Joey Pepperonis Pizza
   ‚≠ê‚≠ê‚≠ê 3.5/5 (127 reviews)
   Cuisine: pizza
   Price: $
   Address: 381 Broadway, Little Italy, NY 10013
   Phone: (2