# 1. Loading Data

In [1]:
import os
import glob
import json
import pandas as pd
import random
import yaml
from typing import List, Dict

In [58]:
# Read configuration from YAML file
with open("config.yaml") as f:
    config = yaml.safe_load(f)

In [None]:
# Loading Kaggle Dataset
kaggle_reviews_df = pd.read_csv(config["kaggle_reviews_path"])

In [None]:
# Loading Kaggle Dataset (DEPRECATED - to be deleted)
kaggle_reviews_df = pd.read_csv('./data/kaggle_reviews.csv')

In [20]:
kaggle_reviews_df.head()

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu


In [21]:
print(kaggle_reviews_df.columns)
print(kaggle_reviews_df.shape)

Index(['business_name', 'author_name', 'text', 'photo', 'rating',
       'rating_category'],
      dtype='object')
(1100, 6)


In [None]:
apify_scraper_df1 = pd.read_csv(config["apify_scraper_path1"])
apify_scraper_df2 = pd.read_csv(config["apify_scraper_path2"])

In [43]:
import random

STATES = [
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware",
    "District of Columbia","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa",
    "Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota",
    "Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey",
    "New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon",
    "Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
    "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"
]

REGION = {
    "Northeast": {"Maine","New Hampshire","Vermont","Massachusetts","Rhode Island","Connecticut",
                  "New York","New Jersey","Pennsylvania"},
    "Midwest": {"Ohio","Michigan","Indiana","Wisconsin","Illinois","Minnesota","Iowa","Missouri",
                "North Dakota","South Dakota","Nebraska","Kansas"},
    "South": {"Delaware","Maryland","District of Columbia","Virginia","West Virginia","North Carolina",
              "South Carolina","Georgia","Florida","Kentucky","Tennessee","Mississippi","Alabama",
              "Oklahoma","Texas","Arkansas","Louisiana"},
    "West": {"Idaho","Montana","Wyoming","Nevada","Utah","Colorado","Arizona","New Mexico",
             "Alaska","Washington","Oregon","California","Hawaii"}
}

random.seed(500)

# Step 1: pick 3 states per region (12 total)
per_region_picks = {}
for rgn, pool in REGION.items():
    pool_list = list(pool & set(STATES))
    per_region_picks[rgn] = random.sample(pool_list, 3)

# Flatten to list of 12
all_12 = [st for r in per_region_picks.values() for st in r]

# Step 2: randomly drop 2 to make 10
all_10 = random.sample(all_12, 10)

# Step 3: shuffle and split into 3 groups
random.shuffle(all_10)
groups = [all_10[i*4:(i+1)*4] for i in range(3)]  # first 2 groups of 4, last group may be shorter

print("Selected 10 states:", all_10)

Selected 10 states: ['Indiana', 'Wisconsin', 'Massachusetts', 'Vermont', 'New Hampshire', 'Nevada', 'Illinois', 'Arkansas', 'Virginia', 'Idaho']


In [None]:
# Change folder and input file individually because 
# running all of them at once hits memory limits

# ---- config ---- (MOVE TO config.yaml)
FOLDER = "./data/google_reviews_US/review-New_York_10.json"   # path to the folder
INPUT_FILE = os.path.join(FOLDER, "review-New_York_10.json")  # the actual file inside
OUTPUT_FILE = "google_reviews_sample_new_york.json"
SAMPLE_SIZE = 1000  # sample about 1000 per state
# ----------------

def read_json_any(path: str) -> pd.DataFrame:
    """Read JSON Lines first, fall back to normal JSON."""
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        pass
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)
    if isinstance(obj, list):
        return pd.json_normalize(obj)
    elif isinstance(obj, dict):
        return pd.json_normalize(obj)
    return pd.DataFrame()

# 1) Load
df = read_json_any(config["US_input_file"])

# 2) Keep only rows with non-null, non-empty "text"
if "text" in df.columns:
    before = len(df)
    df = df[df["text"].notna()]                                 # drop NaNs
    df = df[df["text"].astype(str).str.strip().ne("")]          # drop empty/whitespace
    after = len(df)
    print(f"Filtered 'text': {before:,} -> {after:,} rows")
else:
    print("⚠️ Warning: 'text' column not found; proceeding without filter.")

# 3) Sample (up to SAMPLE_SIZE)
n = min(SAMPLE_SIZE, len(df))
if n < SAMPLE_SIZE:
    print(f"⚠️ Only {len(df):,} rows available after filtering; sampling {n}.")
df_sample = df.sample(n=n, random_state=42)

# 4) Save as JSON Lines
df_sample.to_json(config["US_output_file"], orient="records", lines=True, force_ascii=False)
print(f"✅ Loaded {len(df):,} usable rows, sampled {len(df_sample):,} → saved to {config["US_output_file"]}")

Filtered 'text': 2,677,684 -> 1,382,508 rows
✅ Loaded 1,382,508 usable rows, sampled 1,000 → saved to ./data/us_reviews/output/google_reviews_sample_iowa.json


In [13]:
file_path = './data/Yelp-JSON/Yelp JSON/yelp_dataset/yelp_academic_dataset_review.json'

N = 10000  # how many reviews you want
sample = []

with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f, start=1):
        if i <= N:
            sample.append(line)
        else:
            # Replace elements with decreasing probability
            j = random.randint(1, i)
            if j <= N:
                sample[j-1] = line

# Parse just the sampled JSON lines
yelp_sample = pd.read_json('\n'.join(sample), lines=True)

print(yelp_sample.shape)
print(yelp_sample.head())

(10000, 9)
                review_id                 user_id             business_id  \
0  -BMXekpibxnJU7UVlNDVLQ  Z57PG6be2-CPNOUJ_BOQGw  QRotJ0k3qj4ecdqNprStxQ   
1  wqUFsDcCZ0r3DryheIUCvg  pOz8G2ezXNRx-yCyRi-0Dg  UiALq7G2d9w1S7fvZEv6TA   
2  cb-Td9FaGSpqE96lOnVeSQ  S9izJAfdGsgBI_AHiw3PHA  l331_6tXs8PSryWql2cOrQ   
3  LQ9AQ-G25duVtv5gy7zDTA  rfDqKDpd1_B-VlkPDfHsqQ  pVwMHUYFMuwmRe6M--ZzwA   
4  MqBca9E0uUA-DOXeL8JvBg  JlnvSC3c6t0gOLizuLs2Bw  mSrXEXee3PX8qjwSuSWlSg   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      4       2      0     0   
2      1       1      0     0   
3      3      10      0     2   
4      1       0      0     0   

                                                text                date  
0  First time going here. The swirl margarita was... 2013-07-14 03:11:55  
1  I have drove past this restaurant many times a... 2020-02-26 17:12:57  
2  STAY AWAY! My friends and I stayed here and we... 2019-02-21 07:12:10  
3  Ok my rating is due to w

  yelp_sample = pd.read_json('\n'.join(sample), lines=True)


In [26]:
yelp_sample.to_json('./data/yelp_sample.json', orient='records', lines=True)

# 2. Preprocessing Data

In [53]:
def drop_empty_text_rows(df, text_col=None):
    """
    Remove rows from DataFrame where the specified text column is NaN or empty/whitespace.
    If text_col is None, drop any rows with any null values.
    Args:
        df (pd.DataFrame): Input DataFrame.
        text_col (str or None): Name of the text column to check, or None to drop any row with nulls.
    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    if text_col is None:
        return df.dropna()
    df = df[df[text_col].notna()]  # drop NaNs in text_col
    df = df[df[text_col].astype(str).str.strip().ne("")]  # drop empty/whitespace in text_col
    return df

## 2.1. Kaggle Reviews

In [54]:
# Remove the 'rating_category' column
if 'rating_category' in kaggle_reviews_df.columns:
    kaggle_reviews_df = kaggle_reviews_df.drop(columns=['rating_category'])

kaggle_reviews_df = drop_empty_text_rows(kaggle_reviews_df, text_col="text")

In [55]:
kaggle_reviews_df.shape

(1100, 5)

In [56]:
author_count_df = kaggle_reviews_df['author_name'].value_counts().reset_index()
author_count_df.columns = ['author_name', 'frequency']

In [57]:
print(author_count_df.head())
print(author_count_df.shape)

        author_name  frequency
0   Nihat Karabiber          3
1        Ece Oztunc          3
2        Mustafa Ay          3
3  Saliha Senyildiz          3
4        Seda Seven          2
(1074, 2)


In [32]:
# Save kaggle_reviews_df to a CSV file
kaggle_reviews_df.to_csv(config["kaggle_reviews_cleaned_path"], index=False)
author_count_df.to_csv(config["kaggle_reviews_per_author"], index=False)

## 2.2. US Reviews (above)

## 2.3. Apify Scraper

In [60]:
apify_scraper_df1 = drop_empty_text_rows(apify_scraper_df1, text_col="text")
apify_scraper_df2 = drop_empty_text_rows(apify_scraper_df2, text_col="text")

In [61]:
print(apify_scraper_df1.shape)
print(apify_scraper_df2.shape)

(753, 6)
(377, 100)


In [None]:
apify_scraper_df1.head()

In [63]:
apify_scraper_df2.head()

Unnamed: 0,address,categories/0,categories/1,categories/2,categoryName,cid,city,countryCode,error,fid,...,state,street,temporarilyClosed,text,textTranslated,title,totalScore,translatedLanguage,url,visitedIn
3,"205 Hougang St 21, #01 - 133 / 135, Singapore ...",Cold storage facility,Grocery store,Supermarket,Cold storage facility,3888765206883078885,Singapore,SG,,0x31da17b3f9b9dfb9:0x35f7ab5a232a12e5,...,,"205 Hougang St 21, #01 - 133 / 135",False,Buy food lah,,Cold Storage,4.1,,https://www.google.com/maps/search/?api=1&quer...,
6,"205 Hougang St 21, #01 - 133 / 135, Singapore ...",Cold storage facility,Grocery store,Supermarket,Cold storage facility,3888765206883078885,Singapore,SG,,0x31da17b3f9b9dfb9:0x35f7ab5a232a12e5,...,,"205 Hougang St 21, #01 - 133 / 135",False,Aunty Joe and fat Auntie is serving me good an...,,Cold Storage,4.1,,https://www.google.com/maps/search/?api=1&quer...,
7,"205 Hougang St 21, #01 - 133 / 135, Singapore ...",Cold storage facility,Grocery store,Supermarket,Cold storage facility,3888765206883078885,Singapore,SG,,0x31da17b3f9b9dfb9:0x35f7ab5a232a12e5,...,,"205 Hougang St 21, #01 - 133 / 135",False,Nothing much.. Very small only,,Cold Storage,4.1,,https://www.google.com/maps/search/?api=1&quer...,
9,"205 Hougang St 21, #01 - 133 / 135, Singapore ...",Cold storage facility,Grocery store,Supermarket,Cold storage facility,3888765206883078885,Singapore,SG,,0x31da17b3f9b9dfb9:0x35f7ab5a232a12e5,...,,"205 Hougang St 21, #01 - 133 / 135",False,"Not big, most stuff are there",,Cold Storage,4.1,,https://www.google.com/maps/search/?api=1&quer...,
10,"205 Hougang St 21, #01 - 133 / 135, Singapore ...",Cold storage facility,Grocery store,Supermarket,Cold storage facility,3888765206883078885,Singapore,SG,,0x31da17b3f9b9dfb9:0x35f7ab5a232a12e5,...,,"205 Hougang St 21, #01 - 133 / 135",False,Freshness level totally low. Strawberry at $10...,,Cold Storage,4.1,,https://www.google.com/maps/search/?api=1&quer...,
