In [2]:
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the cleaned data
df = pd.read_csv('nyctastematch-restaurant-recommender/data/processed/restaurants_data_merged_final.csv')

# Parse cuisine column from string to list
df['cuisine'] = df['cuisine'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

print(f"Loaded {len(df)} restaurants")
print(df.head())

Loaded 4598 restaurants
                     name  price  rating  review_count  \
0    123 Burger Shot Beer      1     3.0        1000.0   
1     One Stop Patty Shop      1     4.0          40.0   
2  108 Food Dried Hot Pot      2     3.5         139.0   
3                Cookshop      2     4.0        1000.0   
4        11 Hanover Greek      3     4.0         122.0   

                                             cuisine  \
0  [american, sportsbars, tradamerican, chicken_w...   
1              [bakery, caribbean, breakfast_brunch]   
2                                  [chinese, hotpot]   
3  [american, newamerican, breakfast_brunch, wine...   
4                        [greek, seafood, wine_bars]   

                                 address   display_phone comments  
0  738 10th Ave, Hells Kitchen, NY 10019  (212) 315-0123      NaN  
1   1708 Amsterdam Ave, Harlem, NY 10031  (212) 491-7466      NaN  
2   2794 Broadway, East Harlem, NY 10025  (917) 675-6878      NaN  
3   156 10th Ave, 

In [8]:
# Create a rich text description for each restaurant for better embeddings
def create_restaurant_description(row):
    """
    Create a comprehensive text description for embedding
    """
    description_parts = []
    
    # Restaurant name
    description_parts.append(f"Restaurant name: {row['name']}")
    
    # Cuisine types
    if isinstance(row['cuisine'], list) and len(row['cuisine']) > 0:
        cuisines = ', '.join(row['cuisine'])
        description_parts.append(f"Cuisine types: {cuisines}")
    
    # Price range
    price_labels = {1: "Budget-friendly", 2: "Moderate", 3: "Upscale", 4: "Fine dining"}
    if pd.notna(row['price']) and row['price'] in price_labels:
        description_parts.append(f"Price range: {price_labels[row['price']]}")
    
    # Rating
    if pd.notna(row['rating']):
        description_parts.append(f"Rating: {row['rating']}/5 stars")
    
    # Review count (popularity indicator)
    if pd.notna(row['review_count']):
        if row['review_count'] > 500:
            description_parts.append("Very popular restaurant")
        elif row['review_count'] > 100:
            description_parts.append("Popular restaurant")
    
    # Location
    if pd.notna(row['address']):
        description_parts.append(f"Location: {row['address']}")
    
    # Comments (if available)
    if pd.notna(row['comments']) and row['comments'] != 'N/A':
        description_parts.append(f"Reviews: {row['comments']}")
    
    return ". ".join(description_parts)

# Create descriptions for all restaurants
df['description'] = df.apply(create_restaurant_description, axis=1)

print("\nSample description:")
print(df['description'].iloc[0])


Sample description:
Restaurant name: 123 Burger Shot Beer. Cuisine types: american, sportsbars, tradamerican, chicken_wings. Price range: Budget-friendly. Rating: 3.0/5 stars. Very popular restaurant. Location: 738 10th Ave, Hells Kitchen, NY 10019


In [9]:
from sentence_transformers import SentenceTransformer

# Load embedding model
# Using 'all-MiniLM-L6-v2' - fast and efficient
# Alternative: 'all-mpnet-base-v2' - more accurate but slower
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings for all restaurants...")

# Generate embeddings for all restaurant descriptions
restaurant_embeddings = embedding_model.encode(
    df['description'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"Generated embeddings with shape: {restaurant_embeddings.shape}")

# Save embeddings for future use
np.save('restaurant_embeddings.npy', restaurant_embeddings)
print("Embeddings saved to restaurant_embeddings.npy")

Generating embeddings for all restaurants...


Batches: 100%|███████████████████████████████████████████████████████████████████████| 144/144 [00:34<00:00,  4.12it/s]

Generated embeddings with shape: (4598, 384)
Embeddings saved to restaurant_embeddings.npy





In [10]:
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB client
client = chromadb.Client(Settings(
    persist_directory="./chroma_db",
    anonymized_telemetry=False
))

# Create or get collection
try:
    collection = client.delete_collection("restaurants")
except:
    pass

collection = client.create_collection(
    name="restaurants",
    metadata={"description": "NYC Restaurant recommendations"}
)

# Add documents to ChromaDB
print("Adding restaurants to vector database...")

documents = df['description'].tolist()
ids = [str(i) for i in range(len(df))]
metadatas = df[['name', 'price', 'rating', 'address']].to_dict('records')

# Convert embeddings to list for ChromaDB
embeddings_list = restaurant_embeddings.tolist()

# Add to collection
collection.add(
    documents=documents,
    embeddings=embeddings_list,
    ids=ids,
    metadatas=metadatas
)

print(f"Added {len(documents)} restaurants to vector database")

Adding restaurants to vector database...
Added 4598 restaurants to vector database


In [24]:
def search_restaurants(query, top_k=5, price_filter=None, min_rating=None):
    """
    Search for restaurants based on natural language query
    
    Args:
        query: Natural language search query
        top_k: Number of results to return
        price_filter: List of price ranges to filter [1,2,3,4]
        min_rating: Minimum rating threshold
    
    Returns:
        List of matching restaurants
    """
    # Generate query embedding
    query_embedding = embedding_model.encode(query).tolist()
    
    # Build where clause for filtering with $and operator
    where_clause = None
    conditions = []
    
    if price_filter:
        conditions.append({"price": {"$in": price_filter}})
    if min_rating:
        conditions.append({"rating": {"$gte": min_rating}})
    
    # Use $and if multiple conditions
    if len(conditions) > 1:
        where_clause = {"$and": conditions}
    elif len(conditions) == 1:
        where_clause = conditions[0]
    
    # Query ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        where=where_clause
    )
    
    # Format results
    recommendations = []
    for i in range(len(results['ids'][0])):
        idx = int(results['ids'][0][i])
        restaurant = df.iloc[idx].to_dict()
        restaurant['similarity_score'] = 1 - results['distances'][0][i]
        recommendations.append(restaurant)
    
    return recommendations

In [25]:
def recommend_restaurants(
    query, 
    top_k=5, 
    max_price=None, 
    min_rating=None,
    cuisine_filter=None
):
    """
    Main recommendation function with all filters
    """
    print(f"\n{'='*60}")
    print(f"Searching for: {query}")
    print(f"{'='*60}\n")
    
    # Apply filters
    price_filter = list(range(1, max_price + 1)) if max_price else None
    
    # Get recommendations
    recommendations = search_restaurants(
        query=query,
        top_k=top_k,
        price_filter=price_filter,
        min_rating=min_rating
    )
    
    # Additional cuisine filtering
    if cuisine_filter:
        recommendations = [
            r for r in recommendations 
            if any(c in r['cuisine'] for c in cuisine_filter)
        ]
    
    # Display results
    if not recommendations:
        print("No restaurants found matching your criteria.")
        return []
    
    for i, restaurant in enumerate(recommendations, 1):
        print(f"{i}. {restaurant['name']}")
        print(f"   {'⭐' * int(restaurant['rating'])} {restaurant['rating']}/5")
        print(f"   Cuisine: {', '.join(restaurant['cuisine'])}")
        print(f"   Price: {'$' * restaurant['price']}")
        print(f"   Address: {restaurant['address']}")
        print(f"   Phone: {restaurant['display_phone']}")
        print(f"   Match Score: {restaurant['similarity_score']:.1%}")
        print()
    
    return recommendations


# Example queries
print("Example 1: Romantic dinner")
recommend_restaurants(
    "romantic Italian restaurant for date night",
    top_k=3,
    max_price=3,
    min_rating=4.0
)

print("\nExample 2: Quick lunch")
recommend_restaurants(
    "quick lunch spot near Times Square with good sandwiches",
    top_k=3,
    max_price=2
)

print("\nExample 3: Special dietary needs")
recommend_restaurants(
    "vegetarian friendly restaurant with healthy options",
    top_k=3,
    min_rating=3.5
)

Example 1: Romantic dinner

Searching for: romantic Italian restaurant for date night

1. Primo Amore Italian Restaurant
   ⭐⭐⭐⭐ 4.5/5
   Cuisine: italian
   Price: $$
   Address: 3433 Francis Lewis Blvd, Flushing, NY 11358-1930
   Phone: +1 718-461-6100
   Match Score: 26.5%

2. Guido's Pasta Villa
   ⭐⭐⭐⭐ 4.5/5
   Cuisine: italian, vegetarian_friendly
   Price: $$
   Address: 1313 E Ridge Rd, Rochester, NY 14621-2004
   Phone: +1 585-266-2676
   Match Score: 25.5%

3. Francesco's Restaurant
   ⭐⭐⭐⭐ 4.5/5
   Cuisine: italian, pizza, vegetarian_friendly
   Price: $$
   Address: 600 Mamaroneck Ave, White Plains, NY 10605-2078
   Phone: +1 914-946-3359
   Match Score: 22.4%


Example 2: Quick lunch

Searching for: quick lunch spot near Times Square with good sandwiches

1. Times Square Diner
   ⭐⭐⭐⭐ 4.0/5
   Cuisine: american, tradamerican, diners, breakfast_brunch
   Price: $$
   Address: 807 8th Ave, Midtown West, NY 10019
   Phone: (212) 315-2400
   Match Score: 14.5%

2. Tiny's Giant

[{'name': 'Plant Food + Wine',
  'price': 2,
  'rating': 4.0,
  'review_count': 96.0,
  'cuisine': ['vegetarian', 'vegan', 'raw_food'],
  'address': '67 2nd Ave, Gramercy, NY 10003',
  'display_phone': '(212) 658-1948',
  'comments': nan,
  'description': 'Restaurant name: Plant Food + Wine. Cuisine types: vegetarian, vegan, raw_food. Price range: Moderate. Rating: 4.0/5 stars. Location: 67 2nd Ave, Gramercy, NY 10003',
  'similarity_score': 0.37322115898132324},
 {'name': 'Grazie',
  'price': 2,
  'rating': 4.5,
  'review_count': 376.0,
  'cuisine': ['italian', 'vegetarian_friendly', 'vegan_options'],
  'address': '26 E 84th St, New York City, NY 10028-0426',
  'display_phone': '+1 212-717-4407',
  'comments': 'I went there with some friends and their baby around Christmas. Our server was clearly upset when we told him we only wanted some drinks, coffees and a couple desserts (basically, not dinner but still, a full order): it took him half an hour to...More',
  'description': 'Restau