In [2]:
import pandas as pd
import json

# Paths to the three raw Yelp academic dataset dumps, relative to the working
# directory. Point them at wherever the files actually live on your machine.
business_file, review_file, user_file = (
    'yelp_academic_dataset_business.json',
    'yelp_academic_dataset_review.json',
    'yelp_academic_dataset_user.json',
)

# Function to load JSON lines into a DataFrame
def load_json_lines(filepath):
    """Read a JSON Lines file (one JSON document per line) into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Path to a UTF-8 encoded text file in which every non-blank line is a
        standalone JSON document.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, built by passing the list of parsed
        documents to ``pd.DataFrame``. An input with no non-blank lines
        yields an empty DataFrame.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    OSError
        If the file cannot be opened.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a stray empty line in the
        # middle or at the end of the file): json.loads would raise on them.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Parse each raw dump into its own DataFrame. Every file is read fully into
# memory by load_json_lines before the three names are bound.
business_df, review_df, user_df = (
    load_json_lines(source_path)
    for source_path in (business_file, review_file, user_file)
)

# Report (rows, columns) for each frame as a quick sanity check.
for label, frame in (
    ("Business DataFrame:", business_df),
    ("Review DataFrame:", review_df),
    ("User DataFrame:", user_df),
):
    print(label, frame.shape)


Business DataFrame: (150346, 14)
Review DataFrame: (6990280, 9)
User DataFrame: (1987897, 22)


In [7]:
# Cache the three frames as Parquet files in the working directory so later
# sessions can reload them without re-parsing the JSON dumps.
for parquet_name, frame in (
    ("business_df.parquet", business_df),
    ("review_df.parquet", review_df),
    ("user_df.parquet", user_df),
):
    frame.to_parquet(parquet_name)

In [3]:
import pandas as pd

# Rebind the three working frames to the philly_* Parquet subsets in ../Data.
# NOTE(review): these files are not written by any cell shown here (the save
# cell writes *_df.parquet into the working directory) — presumably they were
# produced elsewhere; confirm their provenance.
business_df, review_df, user_df = map(
    pd.read_parquet,
    (
        "../Data/philly_businesses.parquet",
        "../Data/philly_reviews.parquet",
        "../Data/philly_users.parquet",
    ),
)

In [12]:
# Thresholds for the restaurant subset, named so they can be tuned in one place.
MIN_STARS_EXCLUSIVE = 4.0          # keep businesses rated strictly above this
MIN_BUSINESS_REVIEWS = 5           # keep businesses with at least this many reviews
MIN_USER_REVIEWS = 10              # keep users with at least this many reviews
CATEGORY_KEYWORD = 'Restaurants'   # substring that must appear in `categories`

# Step 1: Keep highly rated, sufficiently reviewed restaurants.
filtered_business_df = business_df[
    (business_df['stars'] > MIN_STARS_EXCLUSIVE) &
    (business_df['review_count'] >= MIN_BUSINESS_REVIEWS) &
    # na=False treats a missing `categories` value as "no match";
    # regex=False because the keyword is a plain substring, not a pattern.
    (business_df['categories'].str.contains(CATEGORY_KEYWORD, na=False, regex=False))
]

# Step 2: Keep only the reviews written for those businesses.
filtered_review_df = review_df[review_df['business_id'].isin(filtered_business_df['business_id'])]

# Step 3: Keep users who wrote at least one of those reviews AND whose
# review_count is at least MIN_USER_REVIEWS; only user_id and friends are kept.
# NOTE(review): the comment here used to say "at least 5 reviews" while the
# code compared against 10. The code's value is preserved — confirm which
# threshold was intended.
# NOTE(review): filtered_review_df is not narrowed to these users afterwards,
# so it still contains reviews by users dropped here — confirm that is wanted.
filtered_user_df = user_df[
    (user_df['user_id'].isin(filtered_review_df['user_id'])) &
    (user_df['review_count'] >= MIN_USER_REVIEWS)
][['user_id', 'friends']]

# Step 4: Save to Parquet with snappy compression, without writing the index.
filtered_business_df.to_parquet("filtered_restaurants_businesses.parquet", index=False, compression='snappy')
filtered_review_df.to_parquet("filtered_restaurants_reviews.parquet", index=False, compression='snappy')
filtered_user_df.to_parquet("filtered_restaurants_users.parquet", index=False, compression='snappy')


In [13]:
# Row counts of the three filtered tables.
restaurant_count = filtered_business_df.shape[0]
user_count = filtered_user_df.shape[0]
review_count = filtered_review_df.shape[0]

# Group the filtered reviews by business once and reuse the grouping.
reviews_by_business = filtered_review_df.groupby('business_id')

# Mean number of distinct reviewers per restaurant.
users_per_restaurant = reviews_by_business['user_id'].nunique().mean()

# Mean number of (non-null) review ids per restaurant.
reviews_per_restaurant = reviews_by_business['review_id'].count().mean()

# Emit the summary, one metric per line.
print(
    f"Number of restaurants: {restaurant_count}",
    f"Number of users: {user_count}",
    f"Number of reviews: {review_count}",
    f"Average users per restaurant: {users_per_restaurant:.2f}",
    f"Average reviews per restaurant: {reviews_per_restaurant:.2f}",
    sep="\n",
)


Number of restaurants: 1137
Number of users: 43101
Number of reviews: 140165
Average users per restaurant: 119.83
Average reviews per restaurant: 123.28


In [8]:
# Preview the first five rows of the filtered user table.
filtered_user_df.head(n=5)

Unnamed: 0,user_id,friends
0,qVc8ODYU5SZjKXVBgXdI7w,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA..."
1,j14WgRoU_-2ZE1aw1dXrJg,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A..."
4,1McG5Rn_UDkmlkZOrsdptg,"piejMEdRkGB7-1aL4lL5NQ, X0zFOU6iG95-feQKOXkgrA..."
7,NIhcRW6DWvk1JQhDhXwgOQ,"T1upaPMzuW7pNj74fO1rjA, CP28puvAEimt4ziuGTDaHA..."
10,QJI9OSEn6ujRCtrX06vs1w,"RyPeT_ICAtX8ah9dhDpEFw, W8r4aKPZFT3GPIQQDbqB6Q..."


In [9]:
# Rebuild the user table directly from user_df in a single .loc selection:
# rows with review_count >= 5, columns user_id and friends only.
# NOTE(review): unlike the restaurant filter cell, this is not restricted to
# users who reviewed the filtered restaurants, and it writes to the same
# Parquet file name that cell uses — confirm the overwrite is intended.
filtered_user_df = user_df.loc[user_df['review_count'] >= 5, ['user_id', 'friends']]

# Persist the result as snappy-compressed Parquet, without the index.
filtered_user_df.to_parquet(
    "filtered_restaurants_users.parquet",
    index=False,
    compression='snappy',
)


In [7]:
# Project the user table down to its user_id and friends columns.
filtered_user_df = filtered_user_df.loc[:, ['user_id', 'friends']]

# Write it out as snappy-compressed Parquet (index omitted) to keep the file small.
filtered_user_df.to_parquet(
    "filtered_restaurants_users.parquet",
    index=False,
    compression='snappy',
)
