In [None]:
import pandas as pd

In [None]:
import json

# The dataset is newline-delimited JSON: one business record per line.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform default, and blank lines are skipped rather than crashing the parser.
with open('yelp_academic_dataset_business.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

In [None]:
# Disabled alternative loader: it would keep only the records whose 'state'
# field equals 'WA'. The code is wrapped in a string literal, so it is not
# executed; it is retained for reference only.

""" 
import json
import pandas as pd

data = []

with open('yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        business = json.loads(line)
        if business['state'] == 'WA':  # Check if 'State' is 'WA'
            data.append(business)

df = pd.DataFrame(data)
"""

In [None]:
# Keep only the Santa Barbara rows. The explicit copy makes df_sb an
# independent frame, so adding columns to it later neither touches df nor
# raises pandas' SettingWithCopyWarning.
df_sb = df[df["city"] == 'Santa Barbara'].copy()

In [None]:
import numpy as np 

# Downtown SB location: 34.4235° N, 119.7035° W
DOWNTOWN_LAT, DOWNTOWN_LON = 34.4235, -119.7035

# Equirectangular approximation. A degree of longitude covers less ground than
# a degree of latitude by a factor of cos(latitude), so the east-west offset is
# scaled before combining. The result is expressed in degrees of latitude,
# which is why multiplying it by ~111 yields kilometres.
lat_offset = df_sb['latitude'] - DOWNTOWN_LAT
lon_offset = (df_sb['longitude'] - DOWNTOWN_LON) * np.cos(np.radians(DOWNTOWN_LAT))

# assign() builds a new frame, so this cell is safe to re-run and never writes
# into a slice of df.
df_sb = df_sb.assign(distance=np.sqrt(lat_offset ** 2 + lon_offset ** 2))


In [None]:
# Show the Santa Barbara subset as the cell output to inspect its columns.
df_sb

In [None]:
import matplotlib.pyplot as plt

# sort_values() returns a new frame, so the result has to be kept. Plotting in
# ascending review_count order draws the most-reviewed businesses last (on top).
df_plot = df_sb.sort_values(by='review_count')

plt.figure(figsize=(12, 4))
plt.grid(True, linestyle='--')

plt.ylabel('Stars')
plt.xlabel('Distance(km)')
# 'distance' is stored in degrees; one degree is roughly 111 km, so convert
# here to match the axis label.
# NOTE(review): a review_count of 0 would give log(0) = -inf; assumes counts are positive.
plt.scatter(
    df_plot['distance'] * 111,
    df_plot['stars'],
    c=np.log(df_plot['review_count'] / df_plot['review_count'].max()),
    cmap='magma',
)
plt.colorbar(label='log(review count / max review count)')
plt.title('Stars vs Distance to downtown')
plt.show()


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))

# The x-axis is the reciprocal of the distance (stored in degrees), so the
# businesses closest to downtown end up furthest to the right.
inverse_distance = 1 / df_sb['distance']
plt.ylabel('Review count')
plt.xlabel('1 / Distance (1/degree)')
plt.title('Review count vs inverse distance to downtown')
plt.grid(True, linestyle='--')

# Marker colour comes from the star rating via `c`/`cmap`; no separate face
# colour is passed because it would not be used when `c` is given.
plt.scatter(inverse_distance, df_sb['review_count'], c=df_sb['stars'], cmap='magma', s=5)

# Add a color bar to show the mapping of stars to colors
color_bar = plt.colorbar()
color_bar.set_label('Stars')

plt.show()

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 8))

# One small marker per row of the full dataset.
ax.scatter(df['review_count'], df['stars'], s=5)

ax.set_title('Stars vs Review count')
ax.set_xlabel('Review count')
ax.set_ylabel('Stars')
ax.grid(True, linestyle='--')

plt.show()

In [None]:
def gaussian(x, A, mu, sigma):
    """Evaluate an unnormalised Gaussian bell curve.

    Args:
        x: Point(s) at which to evaluate; a scalar or a numpy-compatible array.
        A: Peak height, i.e. the value returned at ``x == mu``.
        mu: Location of the peak.
        sigma: Width parameter; it only enters squared, so its sign is irrelevant.

    Returns:
        ``A * exp(-(x - mu)**2 / (2 * sigma**2))``, with the same shape as ``x``.
    """
    deviation = x - mu
    exponent = -(deviation ** 2) / (2 * sigma ** 2)
    return A * np.exp(exponent)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Number of rows per star rating, ordered by rating.
star_counts = df['stars'].value_counts().sort_index()
ratings = star_counts.index.to_numpy(dtype=float)
counts = star_counts.to_numpy(dtype=float)

# Seed the optimiser from the data itself (peak height, weighted mean and
# weighted standard deviation). Without p0, curve_fit starts from all ones,
# which is far from the scale of the counts and can keep it from converging.
initial_mu = np.average(ratings, weights=counts)
initial_sigma = np.sqrt(np.average((ratings - initial_mu) ** 2, weights=counts))
initial_guess = [counts.max(), initial_mu, initial_sigma]

params, covariance = curve_fit(gaussian, ratings, counts, p0=initial_guess)
A_fit, mu_fit, sigma_fit = params

# Dense grid so the fitted curve is drawn smoothly.
x_fit = np.linspace(0, 6, 1000)
y_fit = gaussian(x_fit, A_fit, mu_fit, sigma_fit)

plt.grid(True, linestyle='--',  alpha=0.3)

plt.plot(x_fit, y_fit, 'r-', label='Fitted Gaussian', linewidth=2)
plt.bar(star_counts.index, star_counts.values, tick_label=star_counts.index, width=0.3, align='center')
plt.xlim(0.5, 5.3)
plt.xlabel('Star Rating')
plt.ylabel('Number of Restaurants')
plt.title('Restaurant Star Ratings')
# The curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Number of Santa Barbara rows per star rating, ordered by rating.
star_counts = df_sb['stars'].value_counts().sort_index()
ratings = star_counts.index.to_numpy(dtype=float)
counts = star_counts.to_numpy(dtype=float)

# Seed the optimiser from the data itself (peak height, weighted mean and
# weighted standard deviation). Without p0, curve_fit starts from all ones,
# which is far from the scale of the counts and can keep it from converging.
initial_mu = np.average(ratings, weights=counts)
initial_sigma = np.sqrt(np.average((ratings - initial_mu) ** 2, weights=counts))
initial_guess = [counts.max(), initial_mu, initial_sigma]

params, covariance = curve_fit(gaussian, ratings, counts, p0=initial_guess)
A_fit, mu_fit, sigma_fit = params

# Dense grid so the fitted curve is drawn smoothly.
x_fit = np.linspace(0, 6, 1000)
y_fit = gaussian(x_fit, A_fit, mu_fit, sigma_fit)

plt.grid(True, linestyle='--',  alpha=0.3)

plt.plot(x_fit, y_fit, 'r-', label='Fitted Gaussian', linewidth=2)
plt.bar(star_counts.index, star_counts.values, tick_label=star_counts.index, width=0.3, align='center')
plt.xlim(0.5, 5.3)
plt.xlabel('Star Rating')
plt.ylabel('Number of Restaurants')
# Name the subset in the title so this figure can be told apart from one
# drawn for the full dataset.
plt.title('Restaurant Star Ratings (Santa Barbara)')
# The curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()


In [None]:
# Edges (km) of the half-open distance bands [min, max) around downtown.
distance_slices = [0,1,2,4,8,16,100]
# 'distance' is in degrees; one degree is roughly 111 km. assign() returns a
# new frame, so this is safe to re-run and never writes into a slice.
df_sb = df_sb.assign(distance_km=df_sb['distance'] * 111)

# Dense grid for drawing the fitted curves; it does not depend on the band.
x_fit = np.linspace(0, 6, 1000)

# Loop through distance slices
for min_distance, max_distance in zip(distance_slices[:-1], distance_slices[1:]):
    # Filter the DataFrame for the current distance range
    in_band = (df_sb['distance_km'] >= min_distance) & (df_sb['distance_km'] < max_distance)
    filtered_df = df_sb[in_band]

    # Count the number of restaurants for each star rating in the filtered DataFrame
    star_counts = filtered_df['stars'].value_counts().sort_index()
    if star_counts.empty:
        print(f'No businesses between {min_distance} and {max_distance} km; skipping.')
        continue

    ratings = star_counts.index.to_numpy(dtype=float)
    counts = star_counts.to_numpy(dtype=float)

    plt.figure(figsize=(8, 6))
    plt.bar(star_counts.index, star_counts.values, tick_label=star_counts.index, width=0.6, align='center')

    # A three-parameter fit needs at least three distinct ratings; sparser
    # bands are shown as bars only.
    fit_note = 'fit skipped: fewer than 3 distinct ratings'
    if len(star_counts) >= 3:
        # Start from the data's own peak, weighted mean and weighted spread
        # rather than curve_fit's default of all ones.
        initial_mu = np.average(ratings, weights=counts)
        initial_sigma = np.sqrt(np.average((ratings - initial_mu) ** 2, weights=counts))
        try:
            params, covariance = curve_fit(
                gaussian, ratings, counts, p0=[counts.max(), initial_mu, initial_sigma]
            )
        except RuntimeError:
            # curve_fit raises RuntimeError when the least-squares search fails.
            fit_note = 'fit did not converge'
        else:
            A_fit, mu_fit, sigma_fit = params
            y_fit = gaussian(x_fit, A_fit, mu_fit, sigma_fit)
            plt.plot(x_fit, y_fit, 'r-', label='Fitted Gaussian', linewidth=2)
            plt.legend()
            # sigma only enters the model squared, so report its magnitude.
            fit_note = f'mu: {mu_fit:.2f}, sigma: {abs(sigma_fit):.2f}'

    plt.xlim(0.5, 5.3)
    plt.xlabel('Star Rating')
    plt.ylabel('Number of Restaurants')
    plt.title(f'Restaurant Star Ratings ({min_distance} to {max_distance} km)')
    plt.grid(True, linestyle='--', alpha=0.7)
    # Axes-fraction coordinates keep the annotation inside the plot whatever
    # the data limits are.
    plt.text(0.02, 0.95, fit_note, transform=plt.gca().transAxes, va='top')
    plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Loop through distance slices
for min_distance, max_distance in zip(distance_slices[:-1], distance_slices[1:]):
    # Filter the DataFrame for the current distance range
    in_band = (df_sb['distance_km'] >= min_distance) & (df_sb['distance_km'] < max_distance)
    filtered_df = df_sb[in_band]
    review_counts = filtered_df['review_count']

    if review_counts.empty:
        print(f'No businesses between {min_distance} and {max_distance} km; skipping.')
        continue

    # Histogram of review counts in the filtered DataFrame
    plt.hist(review_counts, bins=30, alpha=0.6, label='Histogram')
    mean = review_counts.mean()

    plt.xlabel('Review Count')
    # The histogram shows raw counts per bin (density is not enabled).
    plt.ylabel('Number of Restaurants')
    plt.title(f'Restaurant Review Counts ({min_distance} to {max_distance} km)')
    plt.grid(True, linestyle='--', alpha=0.7)
    # Position the annotation in axes-fraction coordinates so it does not
    # depend on variables left over from other cells or on the data limits.
    plt.text(0.5, 0.9, f'mu: {mean:.2f}', transform=plt.gca().transAxes, ha='center', va='top')
    plt.legend()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 10))

# Both axes show natural logarithms of the data, so the labels say so.
plt.ylabel('log(Review count)')
plt.xlabel('log(Distance to downtown, degrees)')
plt.title('Review count vs Distance to downtown (log-log)')

# NOTE(review): a distance or review count of exactly 0 maps to -inf here;
# assumes both are strictly positive.
plt.scatter(np.log(df_sb['distance']), np.log(df_sb['review_count']), c=df_sb['stars'], cmap='magma', s=5)


# Add a color bar to show the mapping of values to colors
color_bar = plt.colorbar()
color_bar.set_label('Stars')

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Sort the frame that is actually plotted and keep the result: sort_values()
# returns a new frame. Ascending review_count order draws the most-reviewed
# businesses last (on top).
df_plot = df_sb.sort_values(by='review_count')

plt.figure(figsize=(12, 4))
plt.grid(True, linestyle='--')

plt.ylabel('Stars')
plt.xlabel('Distance(km)')
# 'distance' is stored in degrees; one degree is roughly 111 km, so convert
# here to match the axis label.
# NOTE(review): a review_count of 0 would give log(0) = -inf; assumes counts are positive.
plt.scatter(
    df_plot['distance'] * 111,
    df_plot['stars'],
    c=np.log(df_plot['review_count'] / df_plot['review_count'].max()),
    cmap='magma',
)
plt.colorbar(label='log(review count / max review count)')
plt.title('Stars vs Distance to downtown')
plt.show()

In [None]:
# Edges (km) of the half-open distance bands [min, max) around downtown.
distance_slices = [0,1,2,4,8,16,100]
# 'distance' is in degrees; one degree is roughly 111 km. assign() returns a
# new frame, so this is safe to re-run and never writes into a slice.
df_sb = df_sb.assign(distance_km=df_sb['distance'] * 111)

# Print the mean review count of each band, labelled with the band it belongs to.
for min_distance, max_distance in zip(distance_slices[:-1], distance_slices[1:]):
    in_band = (df_sb['distance_km'] >= min_distance) & (df_sb['distance_km'] < max_distance)
    filtered_df = df_sb[in_band]

    # Series.mean() yields NaN for an empty band instead of raising.
    mean_reviews = filtered_df['review_count'].mean()
    print(f'{min_distance} to {max_distance} km: {mean_reviews:.2f}')