# 1. Loading Dataset

In [2]:
import os
import glob
import json
import pandas as pd
import random
from typing import List, Dict

In [2]:
# Loading Kaggle Dataset
reviews_df = pd.read_csv('./data/kaggle_reviews.csv')

In [3]:
reviews_df.head()

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu


In [5]:
# Loading Google Reviews Dataset

# 50 states + DC (if you don't want DC, remove it)
STATES = [
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware",
    "District of Columbia","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa",
    "Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota",
    "Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey",
    "New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon",
    "Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
    "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"
]

# Optional: region map for stratified selection (helps diversity)
REGION = {
    "Northeast": {"Maine","New Hampshire","Vermont","Massachusetts","Rhode Island","Connecticut",
                  "New York","New Jersey","Pennsylvania"},
    "Midwest": {"Ohio","Michigan","Indiana","Wisconsin","Illinois","Minnesota","Iowa","Missouri",
                "North Dakota","South Dakota","Nebraska","Kansas"},
    "South": {"Delaware","Maryland","District of Columbia","Virginia","West Virginia","North Carolina",
              "South Carolina","Georgia","Florida","Kentucky","Tennessee","Mississippi","Alabama",
              "Oklahoma","Texas","Arkansas","Louisiana"},
    "West": {"Idaho","Montana","Wyoming","Nevada","Utah","Colorado","Arizona","New Mexico",
             "Alaska","Washington","Oregon","California","Hawaii"}
}

def pick_states(states: List[str], k: int = 8, seed: int = 42, stratify: bool = True) -> List[str]:
    """
    Picks k states. If stratify=True, tries to spread picks across regions.
    """
    random.seed(seed)
    if not stratify or k < 4:
        return random.sample(states, k)

    # Stratify: pick as evenly as possible across regions
    regions = list(REGION.keys())
    picks_per_region = [k // 4] * 4
    for i in range(k % 4):
        picks_per_region[i] += 1

    selected = []
    for r, n in zip(regions, picks_per_region):
        pool = list(REGION[r] & set(states))
        n = min(n, len(pool))
        selected.extend(random.sample(pool, n))
    return selected

In [6]:
print("Selected States:", pick_states(STATES, k=8, stratify=True))

Selected States: ['Rhode Island', 'New York', 'Ohio', 'Indiana', 'Georgia', 'Texas', 'Hawaii', 'Idaho']


In [7]:
# Change folder and input file individually because 
# running all of them at once hits memory limits

# ---- config ----
FOLDER = "./data/google_reviews/review-Idaho_10.json"   # path to the folder
INPUT_FILE = os.path.join(FOLDER, "review-Idaho_10.json")  # the actual file inside
OUTPUT_FILE = "google_reviews_sample_idaho.json"
SAMPLE_SIZE = 150
# ----------------

def read_json_any(path: str) -> pd.DataFrame:
    """Read JSON Lines Exfirst, fall back to normal JSON."""
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        pass
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)
    if isinstance(obj, list):
        return pd.json_normalize(obj)
    elif isinstance(obj, dict):
        # flatten dict
        return pd.json_normalize(obj)
    return pd.DataFrame()

# load
df = read_json_any(INPUT_FILE)

# sample
n = min(SAMPLE_SIZE, len(df))
df_sample = df.sample(n=n, random_state=42)

# save as JSON Lines
df_sample.to_json(OUTPUT_FILE, orient="records", lines=True, force_ascii=False)

print(f"✅ Loaded {len(df)} rows, sampled {len(df_sample)} → saved to {OUTPUT_FILE}")


✅ Loaded 2085487 rows, sampled 150 → saved to google_reviews_sample_idaho.json


In [None]:
file_path = './data/Yelp-JSON/Yelp JSON/yelp_dataset/yelp_academic_dataset_review.json'

N = 2000  # how many reviews you want
sample = []

with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f, start=1):
        if i <= N:
            sample.append(line)
        else:
            # Replace elements with decreasing probability
            j = random.randint(1, i)
            if j <= N:
                sample[j-1] = line

# Parse just the sampled JSON lines
yelp_sample = pd.read_json('\n'.join(sample), lines=True)

print(yelp_sample.shape)
print(yelp_sample.head())

(2000, 9)
                review_id                 user_id             business_id  \
0  BzjXbPZKWyR7vmiyoMRuTw  KJf7fWhW1KrroTTCAdxIWA  YMOCOlONOae4zaiKImTnTQ   
1  lPtf4MUENN8raxh8G36Z_Q  ZjCEtvnGO3Ae51ubrAxG7g  YNCUUR7GU5I14cle_2XRcA   
2  hwvXR5r2eDQF1pol9uqWvA  iPkdNziqnhNukjnmsrSzmg  F6HCLpKegKxE6g1jyUaqiQ   
3  vDXql54TVF0ItRd205CGtQ  6s-g2vFu12OemhiK3FJuOQ  WVtg_Jx7OfJRdbnKzT2p4w   
4  Ga-siZ7Sadr3CL4ihaRegg  sdpIz4-s15T239CZ4Bd6Ag  W4ZEKkva9HpAdZG88juwyQ   

   stars  useful  funny  cool  \
0      1       0      0     0   
1      5       2      0     0   
2      4       8      0     4   
3      4       5      2     7   
4      4       6      5     5   

                                                text                date  
0  Quite possibly the worst service@a sushi bar t... 2015-02-16 20:44:19  
1  Visited on 2 different occasions.  First time,... 2021-01-14 04:31:34  
2  Positives: \nExtremely friendly staff \nA Hub ... 2010-04-25 22:14:53  
3  This cozy corner coffee s

  yelp_sample = pd.read_json('\n'.join(sample), lines=True)


In [9]:
yelp_sample.to_json('./data/yelp_sample.json', orient='records', lines=True)