# 1. Loading Dataset

In [2]:
import os
import glob
import json
import pandas as pd
import random
from typing import List, Dict

In [2]:
# Loading Kaggle Dataset
reviews_df = pd.read_csv('./data/kaggle_reviews.csv')

In [3]:
reviews_df.head()

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu


In [22]:
import random

STATES = [
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware",
    "District of Columbia","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa",
    "Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota",
    "Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey",
    "New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon",
    "Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
    "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"
]

REGION = {
    "Northeast": {"Maine","New Hampshire","Vermont","Massachusetts","Rhode Island","Connecticut",
                  "New York","New Jersey","Pennsylvania"},
    "Midwest": {"Ohio","Michigan","Indiana","Wisconsin","Illinois","Minnesota","Iowa","Missouri",
                "North Dakota","South Dakota","Nebraska","Kansas"},
    "South": {"Delaware","Maryland","District of Columbia","Virginia","West Virginia","North Carolina",
              "South Carolina","Georgia","Florida","Kentucky","Tennessee","Mississippi","Alabama",
              "Oklahoma","Texas","Arkansas","Louisiana"},
    "West": {"Idaho","Montana","Wyoming","Nevada","Utah","Colorado","Arizona","New Mexico",
             "Alaska","Washington","Oregon","California","Hawaii"}
}

random.seed(500)

# Step 1: pick 3 states per region (12 total)
per_region_picks = {}
for rgn, pool in REGION.items():
    pool_list = list(pool & set(STATES))
    per_region_picks[rgn] = random.sample(pool_list, 3)

# Flatten to list of 12
all_12 = [st for r in per_region_picks.values() for st in r]

# Step 2: randomly drop 2 to make 10
all_10 = random.sample(all_12, 10)

# Step 3: shuffle and split into 3 groups
random.shuffle(all_10)
groups = [all_10[i*4:(i+1)*4] for i in range(3)]  # first 2 groups of 4, last group may be shorter

print("Selected 10 states:", all_10)

Selected 10 states: ['Missouri', 'North Dakota', 'New Jersey', 'New York', 'Rhode Island', 'Oregon', 'Iowa', 'Georgia', 'Virginia', 'Nevada']


In [5]:
# Change folder and input file individually because 
# running all of them at once hits memory limits

# ---- config ----
FOLDER = "./data/google_reviews_US/review-New_York_10.json"   # path to the folder
INPUT_FILE = os.path.join(FOLDER, "review-New_York_10.json")  # the actual file inside
OUTPUT_FILE = "google_reviews_sample_new_york.json"
SAMPLE_SIZE = 1000  # sample about 1000 per state
# ----------------

def read_json_any(path: str) -> pd.DataFrame:
    """Read JSON Lines first, fall back to normal JSON."""
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        pass
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)
    if isinstance(obj, list):
        return pd.json_normalize(obj)
    elif isinstance(obj, dict):
        return pd.json_normalize(obj)
    return pd.DataFrame()

# 1) Load
df = read_json_any(INPUT_FILE)

# 2) Keep only rows with non-null, non-empty "text"
if "text" in df.columns:
    before = len(df)
    df = df[df["text"].notna()]                                 # drop NaNs
    df = df[df["text"].astype(str).str.strip().ne("")]          # drop empty/whitespace
    after = len(df)
    print(f"Filtered 'text': {before:,} -> {after:,} rows")
else:
    print("⚠️ Warning: 'text' column not found; proceeding without filter.")

# 3) Sample (up to SAMPLE_SIZE)
n = min(SAMPLE_SIZE, len(df))
if n < SAMPLE_SIZE:
    print(f"⚠️ Only {len(df):,} rows available after filtering; sampling {n}.")
df_sample = df.sample(n=n, random_state=42)

# 4) Save as JSON Lines
df_sample.to_json(OUTPUT_FILE, orient="records", lines=True, force_ascii=False)
print(f"✅ Loaded {len(df):,} usable rows, sampled {len(df_sample):,} → saved to {OUTPUT_FILE}")

FileNotFoundError: File ./data/google_reviews_US/review-New_York_10.json\review-New_York_10.json does not exist

In [13]:
file_path = './data/Yelp-JSON/Yelp JSON/yelp_dataset/yelp_academic_dataset_review.json'

N = 10000  # how many reviews you want
sample = []

with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f, start=1):
        if i <= N:
            sample.append(line)
        else:
            # Replace elements with decreasing probability
            j = random.randint(1, i)
            if j <= N:
                sample[j-1] = line

# Parse just the sampled JSON lines
yelp_sample = pd.read_json('\n'.join(sample), lines=True)

print(yelp_sample.shape)
print(yelp_sample.head())

(10000, 9)
                review_id                 user_id             business_id  \
0  -BMXekpibxnJU7UVlNDVLQ  Z57PG6be2-CPNOUJ_BOQGw  QRotJ0k3qj4ecdqNprStxQ   
1  wqUFsDcCZ0r3DryheIUCvg  pOz8G2ezXNRx-yCyRi-0Dg  UiALq7G2d9w1S7fvZEv6TA   
2  cb-Td9FaGSpqE96lOnVeSQ  S9izJAfdGsgBI_AHiw3PHA  l331_6tXs8PSryWql2cOrQ   
3  LQ9AQ-G25duVtv5gy7zDTA  rfDqKDpd1_B-VlkPDfHsqQ  pVwMHUYFMuwmRe6M--ZzwA   
4  MqBca9E0uUA-DOXeL8JvBg  JlnvSC3c6t0gOLizuLs2Bw  mSrXEXee3PX8qjwSuSWlSg   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      4       2      0     0   
2      1       1      0     0   
3      3      10      0     2   
4      1       0      0     0   

                                                text                date  
0  First time going here. The swirl margarita was... 2013-07-14 03:11:55  
1  I have drove past this restaurant many times a... 2020-02-26 17:12:57  
2  STAY AWAY! My friends and I stayed here and we... 2019-02-21 07:12:10  
3  Ok my rating is due to w

  yelp_sample = pd.read_json('\n'.join(sample), lines=True)


In [26]:
yelp_sample.to_json('./data/yelp_sample.json', orient='records', lines=True)

# 2. Load Into Combined Dataframe

### 2.1 Import American States JSON Files

In [49]:
missouri_df = pd.read_json("data/america_states_google/google_reviews_sample_missouri.json", lines = True)
new_jersey_df = pd.read_json("data/america_states_google/google_reviews_sample_new_jersey.json", lines = True)
north_dakota_df = pd.read_json("data/america_states_google/google_reviews_sample_north_dakota.json", lines = True)

In [43]:
missouri_business_df = pd.read_json("data/america_states_google/meta-Missouri.json", lines = True)
new_jersey_business_df = pd.read_json("data/america_states_google/meta-New_Jersey.json", lines = True)
north_dakota_business_df = pd.read_json("data/america_states_google/meta-North_Dakota.json", lines = True)

missouri_business_df.rename(columns={'name': 'place_name'}, inplace=True)
new_jersey_business_df.rename(columns={'name': 'place_name'}, inplace=True)
north_dakota_business_df.rename(columns={'name': 'place_name'}, inplace=True)

In [44]:
missouri_business_df.head()

Unnamed: 0,place_name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,McAnulty Barry,"McAnulty Barry, 521 SE 2nd St, Lee's Summit, M...",0x87c11f8fc8dc2853:0x5abc77ad10bc0ebe,,38.915543,-94.368858,[Counselor],4.0,3,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x87c11f8fe8508ced:0x27e911c66107cafe, 0x87c1...",https://www.google.com/maps/place//data=!4m2!3...
1,Radiant Exterior Lighting,,0x87dec0002c8bd6e3:0xadc5f6e3ec5aca69,,39.253319,-91.418565,"[Landscape lighting designer, Service establis...",5.0,25,,"[[Thursday, 8AM–4PM], [Friday, 8AM–4PM], [Satu...",,Open ⋅ Closes 4PM,"[0x87dece308ab33529:0x83ef05ffe4ce683b, 0x87de...",https://www.google.com/maps/place//data=!4m2!3...
2,Augusta Harmonie Verein (Augusta Heritage Foun...,Augusta Harmonie Verein (Augusta Heritage Foun...,0x87d935395756fbe3:0x9185b746354dc650,,38.575499,-90.876776,[Banquet hall],4.3,7,,,"{'Service options': ['Delivery'], 'Accessibili...",,"[0x87d93500385e8b9f:0x30923934b367835c, 0x87d9...",https://www.google.com/maps/place//data=!4m2!3...
3,Hawke Inc,"Hawke Inc, 5602 Douglas Fir Rd, Joplin, MO 64804",0x87c8650abfed4e0f:0xe349d31dbe05054c,,36.997655,-94.540604,,5.0,1,,,,,,https://www.google.com/maps/place//data=!4m2!3...
4,Spring River Baptist Association,"Spring River Baptist Association, 4037 E 7th S...",0x87c87cf514d85917:0x4967cc8ea885e9f0,,37.083999,-94.455544,[Religious organization],5.0,2,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x87c8f59ee322c40b:0x4b093e8d4e860b1a, 0x87c8...",https://www.google.com/maps/place//data=!4m2!3...


In [50]:
missouri_df['time'] = pd.to_datetime(missouri_df['time'], unit='ms')
missouri_df = missouri_df.drop(
    columns=['user_id']
)
missouri_df = missouri_df.merge(
    missouri_business_df[['gmap_id', 'place_name']],
    on='gmap_id',
    how='left'
)

missouri_df.head()

Unnamed: 0,name,time,rating,text,pics,resp,gmap_id,place_name
0,Sarah Aulbach,2021-05-29 00:34:30.692,5,"The store was clean and organized, and the cas...",,,0x87cf039ca8d9d4bf:0xd2eec4493658ac07,Bass Pro Shops
1,Ericka Woodall,2020-02-15 18:06:33.519,5,"Great food, good service, great atmosphere.",,"{'time': 1582314411656, 'text': 'Hi Ericka, we...",0x87c11cf09781cc17:0xcebc7a3d1993386e,Hooters
2,Roseanna Still,2019-04-19 00:18:39.968,5,Love going to Dollar Tree! Everything is a dol...,,,0x87dd2d3108c73bf7:0xe6690b29f103d155,Dollar Tree
3,William Ward,2019-04-27 19:32:05.516,5,Great selection,,,0x87c0efce9b944fb7:0x69b44258d077de4b,Half Price Books
4,Susanna Allen,2019-12-09 23:01:03.539,3,Great customer service,,,0x87c0fa0239e70477:0x4fe51b9b1426f901,McDonald's


### 2.2 Import Crawled Data

In [54]:
healthcare_df = pd.read_csv("data/singapore_healthcare.csv")
healthcare_reviews_df = pd.read_csv("data/singapore_healthcare_detailed_reviews.csv")

hospitality_df = pd.read_csv("data/singapore_hotels.csv")
hospitality_reviews_df = pd.read_csv("data/singapore_hotels_detailed_reviews.csv")

In [None]:
healthcare_reviews_df = healthcare_reviews_df.drop(
    columns=['place_id', 'review_link', 'review_id', 'reviewer_profile', 'experience_details',
             'published_at', 'review_translated_text', 'response_from_owner_translated_text']
)

healthcare_reviews_df['published_at_date'] = pd.to_datetime(healthcare_reviews_df['published_at_date'])

healthcare_reviews_df.head()

Unnamed: 0,place_name,name,reviewer_id,rating,review_text,published_at_date,response_from_owner_text,response_from_owner_ago,response_from_owner_date,total_number_of_reviews_by_reviewer,total_number_of_photos_by_reviewer,is_local_guide,review_photos
0,LEGACY CLINIC,Two Itchy Feet,101291557305430025101,5,You'll never feel ill when you feel at home --...,2025-08-13 04:07:53,,,,20,42,1.0,"[{""id"":""CIABIhBsuIMlLnIdwu786Zpe3aRG"",""url"":""h..."
1,LEGACY CLINIC,Stephen Fong,117979449519326415761,5,Dr Kavina was courteous and approachable as I ...,2025-08-05 06:31:49,much appreciate your lovely review.. We striv...,3 weeks ago,2025-08-05T06:43:34,1,0,,[]
2,LEGACY CLINIC,Dawn Santa Maria,115438103688989236629,5,Dr Kaur has great expertise and has devised ma...,2025-08-15 11:38:34,,,,8,1,,[]
3,LEGACY CLINIC,Kandi leong,107393619057443653334,5,"Appreciate the tranquility, trusting patient d...",2025-03-26 02:22:03,we wish u a speedy recovery..\n\nMuch appreci...,5 months ago,2025-03-26T04:43:22,3,1,,"[{""id"":""CIHM0ogKEICAgMDwnt3TMw"",""url"":""https:/..."
4,LEGACY CLINIC,Esther Lim,100406321881476331229,5,"Impressed with Dr Kavina' consultation, she's ...",2025-08-12 08:54:54,,,,2,1,,[]


In [20]:
hospitality_reviews_df = hospitality_reviews_df.drop(
    columns=['place_id', 'review_link', 'review_id', 'reviewer_profile', 'experience_details',
             'published_at', 'review_translated_text', 'response_from_owner_translated_text']
)

hospitality_reviews_df.head()

Unnamed: 0,place_name,name,reviewer_id,rating,review_text,published_at_date,response_from_owner_text,response_from_owner_ago,response_from_owner_date,total_number_of_reviews_by_reviewer,total_number_of_photos_by_reviewer,is_local_guide,review_photos
0,"Santa Grand Hotel East Coast, a NuVe Group Col...",Jacob Thomas,102337229616278159974,1.0,We booked this hotel back in March for the fin...,2025-07-13T08:13:51,Thank you for taking the time to share your fe...,a month ago,2025-07-16T09:31:34,24,8,1.0,[]
1,"Santa Grand Hotel East Coast, a NuVe Group Col...",ChanT16,,,"Good location from the airport, about a 15min ...",2025-05-30T23:59:59,,,,0,0,,[]
2,"Santa Grand Hotel East Coast, a NuVe Group Col...",Mullaway On The Beach,118265564388883100797,5.0,Compact rooms but clean and with everything yo...,2025-05-29T20:37:26,Thank you for your kind words! We are thrilled...,2 months ago,2025-06-12T09:03:07,46,11,1.0,[]
3,"Santa Grand Hotel East Coast, a NuVe Group Col...",Carey Wright,107417069074182398855,5.0,We love this hotel! The location is great with...,2025-05-21T08:30:39,Thank you for your kind words! We are thrilled...,3 months ago,2025-05-22T06:32:37,2,9,,"[{""id"":""CIHM0ogKENGbmKv8gJ61Tw"",""url"":""https:/..."
4,"Santa Grand Hotel East Coast, a NuVe Group Col...",JX,116769873905008439961,1.0,My recent stay was a complete disappointment a...,2025-05-04T06:41:33,Thank you for taking the time to share your re...,3 months ago,2025-05-06T01:39:36,327,1476,1.0,"[{""id"":""CIHM0ogKEICAgMCYtuu23gE"",""url"":""https:..."
