In [1]:
# Import the data_cleaning module (make sure this file is in the same directory or on the PYTHONPATH)
import data_cleaning

# Data Cleaning:
chunk_size = 100_000

restaurants_df = data_cleaning.filter_business_data("data/yelp_academic_dataset_business.json", chunk_size)
reviews_df = data_cleaning.filter_review_data("data/yelp_academic_dataset_review.json", restaurants_df, chunk_size)

print(f"Total businesses: {len(restaurants_df)}")
print(f"Total reviews for these restaurants: {len(reviews_df)}")


Total businesses: 27894
Total reviews for these restaurants: 4371282


In [8]:
# 1) Split the existing categories string into lists
restaurants_df['cat_list'] = restaurants_df['categories'].str.split(',\s*', regex=True)

# 2) Explode to one row per individual tag
all_tags = (
    restaurants_df
    .explode('cat_list')['cat_list']
    .dropna()
    .str.strip()
)

# 3) Count raw tags
raw_counts = all_tags.value_counts()
print("=== Raw Cuisine Tags ===")
print(raw_counts.head(20))

# 4) Normalize by removing the generic “Restaurants” tag and sorting
def normalize(tags):
    tags = [t for t in tags if t.lower() != 'restaurants']
    return ', '.join(sorted(tags))

restaurants_df['norm_tags'] = restaurants_df['cat_list'].apply(
    lambda lst: normalize(lst) if isinstance(lst, list) else ''
)

norm_counts = restaurants_df['norm_tags'].value_counts()
print("\n=== Normalized Tag Combinations ===")
print(norm_counts.head(20))


  restaurants_df['cat_list'] = restaurants_df['categories'].str.split(',\s*', regex=True)


=== Raw Cuisine Tags ===
cat_list
Restaurants                  27893
Food                          8229
Nightlife                     6362
Bars                          6145
American (Traditional)        5412
American (New)                4332
Sandwiches                    4188
Breakfast & Brunch            4104
Pizza                         3414
Italian                       2820
Burgers                       2756
Mexican                       2610
Seafood                       2402
Salad                         2127
Coffee & Tea                  2022
Fast Food                     2007
Event Planning & Services     1910
Chinese                       1602
Cafes                         1540
Sushi Bars                    1325
Name: count, dtype: int64

=== Normalized Tag Combinations ===
norm_tags
Mexican                                   790
Pizza                                     567
Chinese                                   488
Italian                                   347
Italian, 

In [5]:
# scoring_tests.py

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest
import data_cleaning

def clean_food_category(cat: str) -> str:
    if not isinstance(cat, str):
        return 'Other'
    mapping = {
        'Italian': ['Italian'],
        'Mexican': ['Mexican', 'Tacos', 'Tex-Mex']
    }
    for label, kws in mapping.items():
        if any(kw.lower() in cat.lower() for kw in kws):
            return label
    return 'Other'

def main():
    # --- 1) Load filtered data ---
    # chunk_size = 100_000
    # restaurants_df = data_cleaning.filter_business_data(
    #     "data/yelp_academic_dataset_business.json",
    #     chunk_size
    # )
    # reviews_df = data_cleaning.filter_review_data(
    #     "data/yelp_academic_dataset_review.json",
    #     restaurants_df,
    #     chunk_size
    # )
    # print(f"Loaded {len(restaurants_df)} restaurants, {len(reviews_df)} reviews\n")

    # --- 2) Map cuisine for each restaurant ---
    restaurants_df['food_category'] = restaurants_df['categories'].apply(clean_food_category)

    # --- 3) Merge reviews with cuisine labels ---
    merged = reviews_df.merge(
        restaurants_df[['business_id','food_category']],
        on='business_id',
        how='left'
    )

    # --- 4) Two-sample t-test: Italian vs Mexican mean stars ---
    stars_it = merged.loc[merged['food_category']=='Italian', 'stars']
    stars_mx = merged.loc[merged['food_category']=='Mexican', 'stars']

    t_stat, p_val = ttest_ind(stars_it, stars_mx, equal_var=False, nan_policy='omit')
    print("T-test: mean stars Italian vs Mexican")
    print(f"  t = {t_stat:.3f}, p = {p_val:.3f}")
    if p_val < 0.05:
        print("  → Significant difference (α=0.05)\n")
    else:
        print("  → No significant difference (α=0.05)\n")

    # --- 5) Two-proportion z-test: positive rate for top-2 restaurants ---
    # Define positive = stars >= 4
    reviews_df['positive'] = (reviews_df['stars'] >= 4).astype(int)

    # Pick the two restaurants with the most reviews
    top_two = restaurants_df.nlargest(2, 'review_count')['business_id'].tolist()
    A_id, B_id = top_two
    A = reviews_df[reviews_df['business_id']==A_id]
    B = reviews_df[reviews_df['business_id']==B_id]

    count = np.array([A['positive'].sum(), B['positive'].sum()])
    nobs  = np.array([len(A),             len(B)])

    z_stat, p_prop = proportions_ztest(count, nobs)
    print("Z-test: positive-review rate for two busiest restaurants")
    print(f"  A_id={A_id} (n={nobs[0]}), B_id={B_id} (n={nobs[1]})")
    print(f"  z = {z_stat:.3f}, p = {p_prop:.3f}")
    if p_prop < 0.05:
        print("  → Positive-rate differs significantly (α=0.05)")
    else:
        print("  → No significant difference (α=0.05)")

if __name__ == "__main__":
    main()


T-test: mean stars Italian vs Mexican
  t = 16.461, p = 0.000
  → Significant difference (α=0.05)

Z-test: positive-review rate for two busiest restaurants
  A_id=_ab50qdWOk0DdB6XOrBitw (n=7673), B_id=ac1AeYqs8Z4_e2X5M3if2A (n=7516)
  z = -0.107, p = 0.915
  → No significant difference (α=0.05)
