# Data Pipeline for South Dakota Reviews
This notebook loads, cleans, and merges review data and business metadata for South Dakota.

In [None]:
# Import Libraries
import gzip, json
import pandas as pd

In [None]:
def parse(path):
    """Yield one decoded JSON value per non-blank line of a gzipped file.

    Args:
        path: Location of a gzip-compressed, UTF-8 encoded text file that
            holds one JSON document per line.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for line in g:
            # Blank lines (e.g. a stray newline at the end of the file) hold
            # no record, and json.loads would raise on them.
            if not line.strip():
                continue
            yield json.loads(line)

## Load Data
Load review and business metadata from gzipped JSON files.

In [None]:
def _id_to_str(value):
    """Return an id as text while leaving missing values missing.

    Floats are written without a fractional part (``str(int(value))``);
    anything else goes through ``str``. Nulls are returned unchanged so a
    later ``dropna`` can still see them instead of the text 'nan'/'None'.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, float):
        return str(int(value))
    return str(value)


# Pin user_id to object dtype so read_json does not run its dtype inference on
# that column. Inference may turn digit-only ids into float64, which cannot
# hold large ids exactly; converting back to text afterwards cannot undo that.
reviews_data = pd.read_json(
    'review_South_Dakota.json.gz',
    lines=True,
    compression='gzip',
    dtype={'user_id': object},
)
reviews_data['user_id'] = reviews_data['user_id'].map(_id_to_str)
biz_meta = pd.read_json('meta_South_Dakota.json.gz', lines=True, compression='gzip')

# standardize columns: lower-case names with surrounding whitespace removed
biz_meta.columns = biz_meta.columns.str.lower().str.strip()
reviews_data.columns = reviews_data.columns.str.lower().str.strip()

In [None]:
# Per-column count of missing values, reviews first and then business metadata.
for frame in (reviews_data, biz_meta):
    print(frame.isna().sum())

# Distinct values found in the metadata's "state" column.
distinct_states = biz_meta["state"].unique()
print(distinct_states)

# Row count of the reviews frame, left as the cell's final expression.
len(reviews_data)

In [None]:
# Dump both frames to stdout, reviews first and then business metadata.
for frame in (reviews_data, biz_meta):
    print(frame)

## Data Cleaning
Clean review and business metadata.

In [None]:
# 1. cleaning of review data

def _resp_field(resp, key):
    """Return ``resp[key]`` when ``resp`` is a dict, otherwise None."""
    return resp.get(key) if isinstance(resp, dict) else None


# Rows missing any of these key fields are dropped. The copy makes the column
# assignments below act on an independent frame.
reviews_data = reviews_data.dropna(subset=['rating', 'time', 'gmap_id', 'user_id']).copy()

# Identifier columns become plain str; the dropna above leaves no nulls in them.
for col in ['user_id', 'gmap_id']:
    reviews_data[col] = reviews_data[col].astype(str)

# Free-text columns use the nullable string dtype so a missing value stays
# <NA> instead of becoming the text 'None'/'nan'. The reviewer name column is
# still called 'name' here; it only becomes 'user_name' in the rename below.
for col in ['name', 'text']:
    if col in reviews_data.columns:
        reviews_data[col] = reviews_data[col].astype("string")

# 'time' is interpreted as milliseconds since the epoch and stored as UTC.
reviews_data["time"] = pd.to_datetime(reviews_data["time"], unit="ms", utc=True)

# 'resp' entries are used only when they are dicts; anything else counts as
# "no response".
resp = reviews_data["resp"]
reviews_data["has_resp"] = resp.map(lambda x: isinstance(x, dict)).astype(bool)
reviews_data["resp_text"] = resp.map(lambda d: _resp_field(d, "text")).astype("string")
reviews_data["resp_time"] = pd.to_datetime(
    resp.map(lambda d: _resp_field(d, "time")),
    unit="ms", utc=True, errors="coerce",  # unparseable times become NaT
)

# Presence-only (True if not null, False if null)
reviews_data['pics'] = reviews_data['pics'].notna()

reviews_data = reviews_data.rename(columns={'name': 'user_name', 'time': 'review_time', 'text': 'review_text'})


In [None]:
# Check for user_id values that still look like float output, e.g. '123.0' or
# '1.08e+20'. The ids are inspected as text: once the column has been cast to
# str it holds no float objects, so an isinstance(x, float) test can never fire.
# NOTE: an id that was rounded as a float and later rewritten as plain digits
# cannot be detected here.
float_like = r'[+-]?(?:\d+\.\d*(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+)'
float_ids = reviews_data['user_id'].astype(str).str.fullmatch(float_like)
if float_ids.any():
    print('Warning: Some user_id values may have lost precision due to float conversion:')
    print(reviews_data.loc[float_ids, 'user_id'])
else:
    print('No user_id precision loss detected.')

In [None]:
# Bare expression: shows the reviews frame as the cell's output.
reviews_data

In [None]:
# Print the dtype of every column in the reviews frame.
print(reviews_data.dtypes)

In [None]:
# 2. cleaning of biz meta data

# A business without a gmap_id cannot be joined to reviews. The copy makes the
# column assignments below act on an independent frame.
biz_meta = biz_meta.dropna(subset=['gmap_id']).copy()

# Join key as plain str; the dropna above leaves no nulls in it.
biz_meta['gmap_id'] = biz_meta['gmap_id'].astype(str)

# Free-text columns use the nullable string dtype so a missing value stays
# <NA> instead of becoming the text 'None'/'nan'.
for col in ['name', 'description']:
    if col in biz_meta.columns:
        biz_meta[col] = biz_meta[col].astype("string")
# NOTE(review): 'category' is left as loaded. If it holds lists, astype(str)
# would store their repr; confirm the wanted format before converting it.

# Convert $ → 1, $$ → 2, etc. (the length of the price string)
biz_meta['price_level'] = biz_meta['price'].str.len()
# Fill missing with 0 = unknown
biz_meta['price_level'] = biz_meta['price_level'].fillna(0).astype('int8')
biz_meta = biz_meta.rename(columns={'name': 'biz_name'})

## Data Merging
Merge relevant columns from business metadata into reviews.

In [None]:
# Metadata columns to carry over to each review; 'gmap_id' is the join key.
wanted_cols = (
    'gmap_id',
    'biz_name',
    'description',
    'category',
    'avg_rating',
    'num_of_reviews',
    'price_level',
)

# Keep only the wanted columns that exist, preserving the order above.
keep_cols = [name for name in wanted_cols if name in biz_meta.columns]

# One metadata row per business: the first occurrence of each gmap_id wins.
biz_meta = biz_meta.loc[:, keep_cols].drop_duplicates(subset=['gmap_id'])

# Left join: every review is kept, and reviews without matching metadata get
# missing values in the metadata columns.
merged_reviews_data = pd.merge(reviews_data, biz_meta, how='left', on='gmap_id')

merged_reviews_data

In [None]:
# Print the dtype of every column in the merged frame.
print(merged_reviews_data.dtypes)