In [None]:
##1. plotting all the restos on a map

In [None]:
!pip install pandas folium

In [None]:
import pandas as pd
import os

# Read the restaurant export; the file holds one JSON record per line
df = pd.read_json(
    'data/yelp_academic_dataset_restaurants.json',
    lines=True,
)

# Preview the top five records to sanity-check the load
print(df.head(n=5))


In [None]:
# Count missing coordinates so the effect of the cleaning step is visible
print("Checking for null values before cleaning:")
print(df[['latitude', 'longitude']].isnull().sum())

# Keep only rows that have BOTH coordinates.
# The explicit .copy() makes df_cleaned an independent DataFrame: later cells
# add columns to it ('cluster', 'category_list', 'category_cluster'), and
# without the copy those assignments can raise pandas' SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['latitude', 'longitude']).copy()

# Cleaning verification: compare shapes to see how many rows were removed
print(f"\nOriginal DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape:  {df_cleaned.shape}")
print(f"Rows removed:             {len(df) - len(df_cleaned)}")


In [None]:
import folium

# Centre the map on the mean latitude / longitude of all restaurants
map_center = [df[axis].mean() for axis in ('latitude', 'longitude')]

# Base map with no markers yet
m = folium.Map(zoom_start=11, location=map_center)

# Leaving the map as the cell's last expression renders it in the notebook
m

In [None]:
# Create a new map object for the individual restaurant markers
m_points = folium.Map(location=map_center, zoom_start=11)

# Plot from df_cleaned rather than df: folium rejects NaN coordinates, so a
# single restaurant with a missing latitude/longitude would abort the loop.
# Only the first 60000 rows are drawn (presumably to keep the map usable).
points = df_cleaned.head(60000)

# Add a marker for each restaurant. Zipping the three needed columns avoids
# building a full Series per row, which is what iterrows() does.
for lat, lon, name in zip(points['latitude'], points['longitude'], points['name']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
        # Add a popup with the restaurant's name
        popup=name,
    ).add_to(m_points)

# Display the map
m_points

In [None]:
## Create heatmap
from folium.plugins import HeatMap

# Create a list of [latitude, longitude] pairs.
# Use df_cleaned, not df: the HeatMap layer raises an error when its data
# contains NaNs, so rows with missing coordinates must be excluded.
heat_data = df_cleaned[['latitude', 'longitude']].values.tolist()

# Create a new map
m_heatmap = folium.Map(location=map_center, zoom_start=11)

# Add the heatmap layer
HeatMap(heat_data).add_to(m_heatmap)

# Display the map
m_heatmap

I find this heatmap a bit hard to read, but I think it's a good start.

Trying marker clustering, which groups nearby points into a single, numbered circle. When zooming in, clusters break apart into smaller clusters or individual points.

In [None]:
import folium
from sklearn.cluster import KMeans
from colorsys import hsv_to_rgb
import numpy as np

# --- K-means clustering on the restaurant coordinates ---

# Number of clusters the model should find
n_clusters = 10

# Two-column array of [latitude, longitude], one row per restaurant
coords = df_cleaned[['latitude', 'longitude']].to_numpy()

# Fit the model and store each restaurant's label in the 'cluster' column
kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
cluster_labels = kmeans.fit_predict(coords)
df_cleaned['cluster'] = cluster_labels

print("Clustering complete!")


# Visualise the clusters on a fresh map centred on the cleaned coordinates
cluster_map_center = [df_cleaned['latitude'].mean(), df_cleaned['longitude'].mean()]
map_clusters = folium.Map(location=cluster_map_center, zoom_start=11)

# One fully saturated, full-brightness hue per cluster, evenly spaced in hue
# and written as a '#rrggbb' hex string
cluster_colors = []
for hue_step in range(n_clusters):
    r, g, b = hsv_to_rgb(hue_step / n_clusters, 1., 1.)
    cluster_colors.append(f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}")

# Draw every restaurant in its cluster's colour.
# Requires the 'cluster' column that the KMeans step adds to df_cleaned.
for lat, lon, cluster_id in zip(
    df_cleaned['latitude'], df_cleaned['longitude'], df_cleaned['cluster']
):
    color = cluster_colors[cluster_id]
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=4,
        popup=f"Cluster {cluster_id}",
        color=color,
        fill=True,
        fill_color=color,
    )
    marker.add_to(map_clusters)

# Display the final map with clusters
map_clusters

As the search was too wide, I narrowed it down further to consider only "quality" restaurants: those with a high star rating and a substantial review count.

In [None]:
### Narrowing the data down further to quality restaurants

# Keep restaurants rated 4.5 stars or higher that also have more than 50 reviews
is_highly_rated = df_cleaned['stars'] >= 4.5
has_enough_reviews = df_cleaned['review_count'] > 50
df_top_rated = df_cleaned[is_highly_rated & has_enough_reviews]
print(f"Found {len(df_top_rated)} top-rated restaurants.")

# Separate map that shows only the top-rated spots
m_top_rated = folium.Map(location=map_center, zoom_start=11)

# One green star marker per restaurant; the popup shows its name and rating
for lat, lon, name, stars in zip(
    df_top_rated['latitude'],
    df_top_rated['longitude'],
    df_top_rated['name'],
    df_top_rated['stars'],
):
    star_icon = folium.Icon(color='green', icon='star')
    folium.Marker(
        location=[lat, lon],
        popup=f"{name} ({stars} stars)",
        icon=star_icon,
    ).add_to(m_top_rated)

# Display the map
m_top_rated

Further analysis: Exploratory Data Analysis on which city has the most restaurants?

In [None]:
!pip install matplotlib seaborn

In [None]:
import pandas as pd

# 1. Turn the comma-separated 'categories' string into a list per restaurant
category_lists = df_cleaned['categories'].str.split(', ')
df_cleaned['category_list'] = category_lists

# 2. Explode the lists so every (restaurant, category) pair gets its own row
df_exploded = df_cleaned.explode(column='category_list')

print("Data after exploding categories:")
preview_columns = ['name', 'category_list']
print(df_exploded[preview_columns].head())

In [None]:
# Collect every distinct city name present in the cleaned data
unique_cities = df_cleaned['city'].unique()

print("Cities available in the dataset:")
print(unique_cities)

In [None]:
## EDA on top cities with most no. of businesses on yelp 
import matplotlib.pyplot as plt
import seaborn as sns

# Count businesses per city and keep the 20 most frequent
city_counts = df_cleaned['city'].value_counts()
top_20_cities = city_counts.head(20)

print("Top 20 Cities by Number of Businesses:")
print(top_20_cities)

# Horizontal bar chart: counts along x, city names along y
plt.figure(figsize=(12, 10))  # tall figure so all 20 city labels stay readable
sns.barplot(
    y=top_20_cities.index,
    x=top_20_cities.values,
    palette='rocket',
)

# Titles and axis labels
plt.title('Top 20 Cities by Number of Businesses', fontsize=16)
plt.ylabel('City', fontsize=12)
plt.xlabel('Number of Businesses', fontsize=12)

plt.xticks(rotation=45)  # tilt the count labels in case they overlap
plt.tight_layout()       # fit everything inside the figure without overlap

plt.show()

EDA: Which cuisine is popular where? (The analysis below groups restaurants by state and looks only at the top 10 states by business count.)

In [None]:
## define names for clusters, mapping cluster ID number to its descriptive name
import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive label for each category cluster; a label's position in the
# list is its cluster ID (0-11)
cluster_names = dict(enumerate([
    'Fast Food',
    'Asian / Chinese Restaurants',
    'Non-Italian Pizzeria',
    'Cafe/Deli',
    'North American Restaurants',
    'Seafood Restaurants',
    'Food Trucks',
    'Breakfast & Brunch Places',
    'Italian Pizzeria / Restaurants',
    'Sushi Bars / Mexican',
    'Caterers & Event Companies',
    'Bars / Nightlife Restaurants',
]))

In [None]:
## generating category cluster column in current dataset
import pandas as pd

# Signature categories of each cluster; a signature's position in the list
# is its cluster ID (0-11)
cluster_signatures = dict(enumerate([
    ['Restaurants', 'Fast Food', 'Burgers', 'Sandwiches', 'Food'],
    ['Restaurants', 'Chinese', 'Asian Fusion', 'Japanese', 'Food'],
    ['Pizza', 'Restaurants', 'Sandwiches', 'Food', 'Chicken Wings'],
    ['Food', 'Restaurants', 'Coffee & Tea', 'Sandwiches', 'Breakfast & Brunch'],
    ['Restaurants', 'Mexican', 'Sandwiches', 'American (New)', 'American (Traditional)'],
    ['Seafood', 'Restaurants', 'American (Traditional)', 'Steakhouses', 'Cajun/Creole'],
    ['Restaurants', 'Food', 'Food Trucks', 'Mexican', 'Event Planning & Services'],
    ['American (Traditional)', 'Restaurants', 'Breakfast & Brunch', 'American (New)', 'Food'],
    ['Italian', 'Pizza', 'Restaurants', 'Sandwiches', 'Food'],
    ['Mexican', 'Restaurants', 'Sushi Bars', 'Bars', 'Nightlife'],
    ['Restaurants', 'Caterers', 'Event Planning & Services', 'Food', 'Sandwiches'],
    ['Restaurants', 'Nightlife', 'Bars', 'American (Traditional)', 'American (New)'],
]))

# create the function to find the best cluster for a restaurant
def assign_best_cluster(restaurant_categories, signatures=None):
    """
    Return the ID of the cluster whose signature shares the most categories
    with a restaurant.

    Parameters
    ----------
    restaurant_categories : list of str
        The restaurant's categories. Any non-list value (e.g. NaN for a
        missing 'categories' field) is treated as "no categories".
    signatures : dict, optional
        Mapping of cluster ID -> list of signature categories. Defaults to
        the notebook-level ``cluster_signatures``.

    Returns
    -------
    int
        The best-matching cluster ID, or -1 when the input is not a list or
        shares no category with any signature. On ties, the cluster that
        comes first in ``signatures`` wins.
    """
    if signatures is None:
        signatures = cluster_signatures

    # Ensure the input is a list
    if not isinstance(restaurant_categories, list):
        return -1

    # Build the set once instead of once per cluster
    categories = set(restaurant_categories)

    # Start at 0 (not -1) so a cluster needs at least one shared category to
    # win; otherwise a restaurant with zero matches would land in cluster 0.
    best_score = 0
    best_cluster_id = -1  # Default for no match

    for cluster_id, signature_cats in signatures.items():
        # Score = number of categories shared with this cluster's signature
        score = len(categories.intersection(signature_cats))

        if score > best_score:
            best_score = score
            best_cluster_id = cluster_id

    return best_cluster_id

# Build the 'category_cluster' column

# Create 'category_list' only if an earlier cell has not already added it
has_category_list = 'category_list' in df_cleaned.columns
if not has_category_list:
    df_cleaned['category_list'] = df_cleaned['categories'].str.split(', ')

print("Assigning each restaurant to a category cluster. This may take a moment...")

# Score every restaurant's category list against the cluster signatures
assigned_clusters = df_cleaned['category_list'].apply(assign_best_cluster)
df_cleaned['category_cluster'] = assigned_clusters

print("Done! The 'category_cluster' column has been created.")

# Verify: number of restaurants per cluster, ordered by cluster ID
cluster_distribution = df_cleaned['category_cluster'].value_counts().sort_index()
print("\nHere is the distribution of restaurants across the new clusters:")
print(cluster_distribution)


In [None]:
## Analyze and Plot Top Clusters for the Top 10 States

# get the list of the top 10 states by business count
top_10_states = df_cleaned['state'].value_counts().head(10).index.tolist()

print(f"--- Analyzing Top Restaurant Types for Top 10 States: {top_10_states} ---")

# loop through each state and generate one bar plot per state
for state in top_10_states:
    # Filter the DataFrame for the current state
    state_df = df_cleaned[df_cleaned['state'] == state]

    # Count the occurrences of each cluster in that state
    # and keep the 5 most common clusters
    top_clusters_in_state = state_df['category_cluster'].value_counts().head(5)

    # Replace the cluster numbers with their descriptive names.
    # Cluster ID -1 (restaurant matched no cluster) has no entry in
    # cluster_names; mapping with the dict directly would turn it into a NaN
    # label, so fall back to an explicit 'Unclassified' name instead.
    top_clusters_in_state.index = top_clusters_in_state.index.map(
        lambda cluster_id: cluster_names.get(cluster_id, 'Unclassified')
    )

    # Create the bar plot
    plt.figure(figsize=(10, 7))
    sns.barplot(x=top_clusters_in_state.values, y=top_clusters_in_state.index, palette='magma')

    plt.title(f'Top 5 Restaurant Types in {state}', fontsize=16)
    plt.xlabel('Number of Businesses', fontsize=12)
    plt.ylabel('Restaurant Type (Cluster)', fontsize=12)
    plt.tight_layout()
    plt.show()


In [None]:
## for easier readability, also created a stacked bar chart 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Re-create the DataFrame from the source file
print("Loading and preparing data...")
df = pd.read_json('data/yelp_academic_dataset_restaurants.json', lines=True)

# Keep rows where both coordinates are present, as an independent copy
has_both_coords = df[['latitude', 'longitude']].notna().all(axis=1)
df_cleaned = df[has_both_coords].copy()

# Signature categories of each cluster; a signature's position in the list
# is its cluster ID (0-11)
cluster_signatures = dict(enumerate([
    ['Restaurants', 'Fast Food', 'Burgers', 'Sandwiches', 'Food'],
    ['Restaurants', 'Chinese', 'Asian Fusion', 'Japanese', 'Food'],
    ['Pizza', 'Restaurants', 'Sandwiches', 'Food', 'Chicken Wings'],
    ['Food', 'Restaurants', 'Coffee & Tea', 'Sandwiches', 'Breakfast & Brunch'],
    ['Restaurants', 'Mexican', 'Sandwiches', 'American (New)', 'American (Traditional)'],
    ['Seafood', 'Restaurants', 'American (Traditional)', 'Steakhouses', 'Cajun/Creole'],
    ['Restaurants', 'Food', 'Food Trucks', 'Mexican', 'Event Planning & Services'],
    ['American (Traditional)', 'Restaurants', 'Breakfast & Brunch', 'American (New)', 'Food'],
    ['Italian', 'Pizza', 'Restaurants', 'Sandwiches', 'Food'],
    ['Mexican', 'Restaurants', 'Sushi Bars', 'Bars', 'Nightlife'],
    ['Restaurants', 'Caterers', 'Event Planning & Services', 'Food', 'Sandwiches'],
    ['Restaurants', 'Nightlife', 'Bars', 'American (Traditional)', 'American (New)'],
]))

# Define the assignment function
def assign_best_cluster(restaurant_categories):
    """
    Return the ID of the cluster in ``cluster_signatures`` that shares the
    most categories with ``restaurant_categories``.

    Returns -1 when the input is not a list or no signature shares a
    category with it. On ties, the cluster listed first wins.
    """
    if not isinstance(restaurant_categories, list):
        return -1

    wanted = set(restaurant_categories)
    winner, top_overlap = -1, 0
    for cid, cats in cluster_signatures.items():
        overlap = len(wanted.intersection(cats))
        # Strict '>' keeps the earliest cluster when overlaps are equal
        if overlap > top_overlap:
            winner, top_overlap = cid, overlap
    return winner

# Derive each restaurant's category list, then its best-matching cluster
category_lists = df_cleaned['categories'].str.split(', ')
df_cleaned['category_list'] = category_lists
df_cleaned['category_cluster'] = category_lists.apply(assign_best_cluster)
print("Data ready for visualization.")

# Prepare data for the chart
# Short cluster labels for the chart legend; a label's position in the list
# is its cluster ID (0-11)
cluster_names = dict(enumerate([
    'Fast Food',
    'Asian/Chinese',
    'Pizzeria (Non-Italian)',
    'Cafe/Deli',
    'North American',
    'Seafood/Steakhouse',
    'Food Truck',
    'Breakfast/Brunch',
    'Italian/Pizzeria',
    'Sushi/Mexican (Bars)',
    'Caterers',
    'Bars/Nightlife',
]))

# Get the top 10 states by business count and keep only their restaurants
top_10_states = df_cleaned['state'].value_counts().head(10).index.tolist()
df_top_states = df_cleaned[df_cleaned['state'].isin(top_10_states)]

# Create a cross-tabulation to count clusters per state
# (rows = states, columns = cluster IDs)
heatmap_data = pd.crosstab(df_top_states['state'], df_top_states['category_cluster'])

# Normalize each row by its total so every state's values sum to 1
chart_data = heatmap_data.div(heatmap_data.sum(axis=1), axis=0)

# Rename the columns from numbers to descriptive names.
# assign_best_cluster returns -1 for restaurants that match no cluster, and
# -1 has no entry in cluster_names; mapping with the dict directly would
# label that column NaN, so fall back to an explicit 'Unclassified' name.
chart_data.columns = chart_data.columns.map(
    lambda cluster_id: cluster_names.get(cluster_id, 'Unclassified')
)


# Stacked horizontal bar chart: one bar per state, one segment per cluster
print("\nGenerating stacked bar chart...")
ax = chart_data.plot(
    kind='barh',
    stacked=True,
    figsize=(14, 10),
    colormap='tab20c',   # many distinct colours, one per restaurant type
    edgecolor='white',
)

# Percentage labels inside the segments.
# Segments no wider than 4% (0.04) get an empty label to avoid clutter.
for container in ax.containers:
    labels = []
    for segment in container:
        width = segment.get_width()
        labels.append(f'{width:.1%}' if width > 0.04 else '')

    ax.bar_label(
        container,
        labels=labels,
        label_type='center',  # centre the text within its segment
        color='black',
        fontsize=9,
        fontweight='bold',
    )

# Titles and axis labels
ax.set_title('Proportion of Restaurant Types by State', fontsize=20)
ax.set_ylabel('State', fontsize=14)
ax.set_xlabel('Percentage', fontsize=14)

# Show x-axis ticks as whole percentages
percent_formatter = plt.FuncFormatter('{:.0%}'.format)
ax.xaxis.set_major_formatter(percent_formatter)

# Put the legend to the right of the axes so it does not cover the data
ax.legend(
    title="Restaurant Type",
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    fontsize='medium',
)

# Reserve the right-hand 15% of the figure for the legend
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
## 2. DOES LOCATION AFFECT PRICE??


In [None]:
import pandas as pd
import json
import os

# --- Step 1: Check if the file exists and provide the correct path ---
file_path = 'yelp_food_business_flat.json'  # Update this with the correct path if needed

# Check if the file exists before trying to open it
if not os.path.exists(file_path):
    print(f"Error: The file '{file_path}' does not exist.")
    print(f"Current working directory: {os.getcwd()}")
    print("Please check the file path or upload the file to this directory.")
else:
    # --- Step 2: Load the raw JSON data from the file ---
    with open(file_path, 'r') as f:
        data = json.load(f)

    # --- Step 3: Use json_normalize to flatten the data correctly ---
    df_normalized = pd.json_normalize(data)

    print("Successfully loaded and flattened the data!")
    print("Here are the available columns now:")
    print(df_normalized.columns.tolist())

    # --- Step 4: Clean the data (remove null lat/lon) ---
    df_cleaned = df_normalized.dropna(subset=['latitude', 'longitude'])