### Libraries

In [1]:
import random
import json
import gc
import os
import pandas as pd

### Load and merge raw files

In [4]:
# Reviews json file
review_raw = "data/review-California.json"
review_filtered = "sampled_data/review_California_filtered.json"

# Filter review data for relevant columns
review_cols = ['user_id', 'time', 'rating', 'text', 'gmap_id']
gmap_ids = set()

with open(review_raw, "r", encoding="utf-8") as fin, \
     open(review_filtered, "w", encoding="utf-8") as fout:
    
    for line in fin:
        obj = json.loads(line)
        # Keep only relevant columns
        filtered_obj = {k: obj.get(k) for k in review_cols}
        fout.write(json.dumps(filtered_obj) + "\n")
        gmap_ids.add(obj["gmap_id"])
        del obj, filtered_obj  # Clear memory

print("Review file processed.")
print(f"Unique gmap_ids: {len(gmap_ids)}")
print()

# Meta json file
meta_raw = "data/meta-California.json"
meta_filtered = "sampled_data/meta_California_filtered.json"

# Filter meta data for relevant columns
meta_cols = ['gmap_id', 'name', 'category', 'avg_rating', 'num_of_reviews']

with open(meta_raw, "r", encoding="utf-8") as fin, \
     open(meta_filtered, "w", encoding="utf-8") as fout:
    
    for line in fin:
        obj = json.loads(line)
        if obj["gmap_id"] in gmap_ids:
            # Keep only relevant columns
            filtered_obj = {k: obj.get(k) for k in meta_cols}
            fout.write(json.dumps(filtered_obj) + "\n")
            del obj, filtered_obj  # Clear memory

# Clear gmap_ids set
del gmap_ids
gc.collect()

print("Meta file processed.")
print()

# Merge Review & Meta
reviews = pd.read_json(review_filtered, lines=True)
meta = pd.read_json(meta_filtered, lines=True)

# Merge on gmap_id
df = reviews.merge(meta, on='gmap_id', how='left', suffixes=('_review', '_business'))

# Clear intermediate dataframes
del reviews, meta
gc.collect()

# Rename 'name' to 'name_business' for consistency
if 'name' in df.columns:
    df.rename(columns={'name': 'name_business'}, inplace=True)

print("*Combined Dataset*")
print(f"Total size: {df.shape}")
print(f"Total reviews: {len(df)}")
print()

# Save in chunks to avoid memory crash
combined_sample = "data/combined_California_filtered.json"
chunk_size = 1_000_000  # 1M rows at a time

print("Saving in chunks...")
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i+chunk_size]
    mode = 'w' if i == 0 else 'a'  # Write first chunk, append rest
    chunk.to_json(combined_sample, orient='records', lines=True, mode=mode)
    print(f"Saved rows {i:,} to {min(i+chunk_size, len(df)):,}")
    del chunk
    gc.collect()

print(f"Combined df saved as json.")

# Clear df after saving
del df
gc.collect()

Review file processed.
Unique gmap_ids: 513134

Meta file processed.

*Combined Dataset*
Total size: (70632313, 9)
Total reviews: 70632313

Saving in chunks...
Saved rows 0 to 1,000,000
Saved rows 1,000,000 to 2,000,000
Saved rows 2,000,000 to 3,000,000
Saved rows 3,000,000 to 4,000,000
Saved rows 4,000,000 to 5,000,000
Saved rows 5,000,000 to 6,000,000
Saved rows 6,000,000 to 7,000,000
Saved rows 7,000,000 to 8,000,000
Saved rows 8,000,000 to 9,000,000
Saved rows 9,000,000 to 10,000,000
Saved rows 10,000,000 to 11,000,000
Saved rows 11,000,000 to 12,000,000
Saved rows 12,000,000 to 13,000,000
Saved rows 13,000,000 to 14,000,000
Saved rows 14,000,000 to 15,000,000
Saved rows 15,000,000 to 16,000,000
Saved rows 16,000,000 to 17,000,000
Saved rows 17,000,000 to 18,000,000
Saved rows 18,000,000 to 19,000,000
Saved rows 19,000,000 to 20,000,000
Saved rows 20,000,000 to 21,000,000
Saved rows 21,000,000 to 22,000,000
Saved rows 22,000,000 to 23,000,000
Saved rows 23,000,000 to 24,000,000
Sav

0

#### This json file was still 25GB once saved. 

### Load merged data

##### * Future processing start from here *

In [2]:
# LOAD MERGED DATASET

chunk_size = 1_000_000  # 1M rows at a time
chunks = []

print("Reading in chunks...")
for i, chunk in enumerate(pd.read_json('data/combined_California_filtered.json', 
                                        lines=True, 
                                        chunksize=chunk_size)):
    chunks.append(chunk)
    if (i + 1) % 10 == 0:  # Print every 10 chunks
        print(f"Loaded chunk {i+1}: {len(chunk):,} rows")

# Combine chunks
df = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f"\nTotal merged rows: {len(df):,}")

Reading in chunks...
Loaded chunk 10: 1,000,000 rows
Loaded chunk 20: 1,000,000 rows
Loaded chunk 30: 1,000,000 rows
Loaded chunk 40: 1,000,000 rows
Loaded chunk 50: 1,000,000 rows
Loaded chunk 60: 1,000,000 rows
Loaded chunk 70: 1,000,000 rows

Total merged rows: 70,632,313


### Clean data

In [3]:
# Duplicate check and delete
print("*Duplicate Review Check*")
print(f"Total rows before: {len(df):,}")
duplicate_mask = df.duplicated(subset=['user_id', 'gmap_id', 'time'], keep='first') # If user, business and time match, consider that a duplicate
print(f"Duplicate reviews found: {duplicate_mask.sum():,}")
df = df[~duplicate_mask].copy()
print(f"Total rows after: {len(df):,}")
del duplicate_mask
gc.collect()

# Remove ratings with NaN
df = df.dropna(subset=['rating'])
print(f"Rows after removing NaN ratings: {len(df):,}")
print()

# Remove reviews with NaN or missing text
df = df[df['text'].notna() & (df['text'].str.strip() != '')]
print(f"Rows after removing reviews with empty text: {len(df):,}")
print()

df.info()

*Duplicate Review Check*
Total rows before: 70,632,313
Duplicate reviews found: 976,178
Total rows after: 69,656,135
Rows after removing NaN ratings: 69,286,010

Rows after removing reviews with empty text: 38,074,228

<class 'pandas.DataFrame'>
Index: 38074228 entries, 0 to 70632058
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         float64
 1   time            int64  
 2   rating          float64
 3   text            str    
 4   gmap_id         str    
 5   name_business   str    
 6   category        object 
 7   avg_rating      float64
 8   num_of_reviews  int64  
dtypes: float64(3), int64(2), object(1), str(3)
memory usage: 10.1+ GB


In [4]:
# Temporary saving point for a manageable size file for eda
df.to_csv('data/cleaned_for_eda.csv', index=False)
print("Saved to csv.")

Saved to csv.


## More possible filters for modeling  
- businesses with more than 10 reviews, user_id with 5 or more reviews (businesses with more than 10 reviews and reviewers with more than 5 reviews are likely to yield more consistent results, less outliers)