In [9]:
import sqlite3
import pandas as pd
import json

In [10]:
# Build 'popular_businesses.json': the businesses with the highest summed review
# star ratings over the two most recent months of review data.
db_folder = '../../data/processed_data/yelp_data/'

latest_month = "2022-01"
last_month = "2021-12"
# number of top-ranked businesses written to the output file
top_n = 100

# Load reviews whose date starts with latest_month or last_month. The month
# prefixes are bound as query parameters rather than formatted into the SQL text.
review_conn = sqlite3.connect(db_folder + 'yelp_review_data_new.db')
try:
    review_df = pd.read_sql_query(
        "SELECT * FROM review_data WHERE date LIKE ? OR date LIKE ?",
        review_conn,
        params=(f"{latest_month}%", f"{last_month}%"),
    )
finally:
    # `with sqlite3.connect(...)` only manages transactions; close explicitly
    review_conn.close()

# load business data
business_conn = sqlite3.connect(db_folder + 'yelp_business_data.db')
try:
    business_df = pd.read_sql_query("SELECT * FROM business_details", business_conn)
finally:
    business_conn.close()

# Left-join business details onto each review. `suffixes` is (left, right):
# review_df is the left frame, so its overlapping columns must get '_review'
# and the business columns '_business'. With the pair reversed, 'stars_review'
# would actually hold the business table's value.
# NOTE(review): assumes both tables carry a `stars` column; if only one does,
# pandas applies no suffix and the 'stars_review' lookup below raises KeyError.
df_concat = pd.merge(
    review_df,
    business_df,
    how='left',
    on='business_id',
    suffixes=('_review', '_business'),
)

# popularity = sum of the review star ratings per business
popularity_df = (
    df_concat[["business_id", "stars_review"]]
    .groupby("business_id")
    .agg({"stars_review": "sum"})
    .reset_index()
)
popularity_df.columns = ["business_id", "popularity"]

# sort by popularity in descending order and keep the top `top_n` businesses
top_businesses_df = (
    popularity_df
    .sort_values(by="popularity", ascending=False)
    .head(top_n)
    .reset_index(drop=True)
)

# format: array of objects, each object contains business_id and popularity
train_data = top_businesses_df.to_dict(orient='records')

# save the result to a JSON file as 'popular_businesses.json'
with open('popular_businesses.json', 'w') as f:
    json.dump(train_data, f, indent=4)