In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load businesses and filter restaurants
restaurant_keywords = ["restaurant", "food"]
businesses = pd.read_json('yelp_dataset\yelp_academic_dataset_business.json', orient="records", lines=True)
businesses["is_restaurant"] = businesses["categories"].fillna("").str.contains("|".join(restaurant_keywords), case=False, na=False)

In [4]:
# Sample restaurants
total_target = 8000
min_per_region = 100
random_state = 42

frac = total_target / len(businesses)

def stratified_sample(df, frac, min_n):
    return (df.groupby('state', group_keys=False)
                .apply(lambda x: x.sample(
                    n=min(len(x), max(int(len(x)*frac), min_n)),
                    random_state=random_state,
                )))

valid_states = ['CA','MO','AZ','PA','TN','FL','IN','LA','AB','NV','ID','DE','IL','NJ',
                'NC','CO','WA','HI','UT','TX','MT','MI','SD','MA','VI','VT']
restaurants = businesses[businesses["is_restaurant"]]
restaurants = restaurants[restaurants["state"].isin(valid_states)]
restaurant_sample = stratified_sample(restaurants, frac, min_per_region)
restaurant_ids = set(restaurant_sample["business_id"])
restaurant_sample.to_csv("business_sample.csv", index=False)

  .apply(lambda x: x.sample(


In [5]:
# Process reviews in chunks
reviews = pd.read_json('yelp_dataset\yelp_academic_dataset_review.json', orient="records", lines=True, chunksize=50000)

filtered_reviews = []
for i, chunk in enumerate(reviews):
    # Filter only reviews for sampled_restaurants
    matched = chunk[chunk["business_id"].isin(restaurant_ids)]

    # Merge restaurant metadata
    joined = pd.merge(matched, restaurant_sample, on='business_id', how='inner')

    filtered_reviews.append(joined)

    # Stop early if we've processed all ~92k reviews
    if len(pd.concat(filtered_reviews, ignore_index=True)) >= 45000:
        break

# Combine all chunks
final_reviews_df = pd.concat(filtered_reviews)
print(final_reviews_df.shape)
print(final_reviews_df.head())
final_reviews_df.to_csv("restaurant_reviews_sample.csv", index=False)

(46505, 23)
                review_id                 user_id             business_id  \
0  XW_LfMv0fV21l9c6xQd_lw  9OAtfnWag-ajVxRbUTGIyg  lj-E32x9_FA7GmUrBGBEWg   
1  RGV9GWhAAfAAlYyd4vho7g  Zs8Zk3sgh5JxRmoZW4PJcg  3ZynJ94VpIdDlaArmEp2Rg   
2  zqmkEnp1kfU2vosDcG2kMg  KqKXOl0PMlZGBMlw8OUpyA  -If0ps0QhOLCYVWQWs9RYg   
3  bi6GaeWDGceGv62lXTIKQA  RgtbLaiU22zqaCk20HgbiQ  bjhCtlYHrkgA5Ku8l-rB3g   
4  TgDp1TErom3UNglKhDy2uw  zUB7xoTlhbg7_ofHg8Qp0w  W8Z4rXYkmZlEVSaxIVjyvg   

   stars_x  useful  funny  cool  \
0        4       0      0     0   
1        3       0      0     0   
2        5       2      2     2   
3        1       1      0     0   
4        5       1      0     0   

                                                text                date  \
0  Love going here for happy hour or dinner!  Gre... 2014-06-27 22:44:01   
1  A couple friends and I stopped by for some lat... 2013-10-03 16:24:30   
2  Yes! I love this place! Maple Street Patisseri... 2013-05-28 21:37:01   
3  Very di