# Data Pipeline for South Dakota Reviews

This notebook processes and cleans review and business metadata for South Dakota.


In [16]:
# Import Libraries
import gzip, json
import pandas as pd

In [17]:
def parse(path):
    with gzip.open(path, "rt", encoding="utf-8") as g:
        for line in g:
            yield json.loads(line)

## Load Data

Load review and business metadata from gzipped JSON files.


In [18]:
reviews_data = pd.read_json(
    "../data/review_South_Dakota.json.gz", lines=True, compression="gzip"
)
reviews_data["user_id"] = reviews_data["user_id"].apply(
    lambda x: str(int(x)) if isinstance(x, float) and not pd.isnull(x) else str(x)
)
biz_meta = pd.read_json(
    "../data/meta_South_Dakota.json.gz", lines=True, compression="gzip"
)

# standardize columns
biz_meta.columns = biz_meta.columns.str.lower().str.strip()
reviews_data.columns = reviews_data.columns.str.lower().str.strip()

In [19]:
print(reviews_data.isnull().sum())
print(biz_meta.isnull().sum())
print(biz_meta["state"].unique())
len(reviews_data)

user_id         0
name            0
time            0
rating          0
text       325966
pics       657233
resp       589840
gmap_id         0
dtype: int64
name                    0
address               153
gmap_id                 0
description         12149
latitude                0
longitude               0
category               62
avg_rating              0
num_of_reviews          0
price               12021
hours                4080
misc                 2740
state                4602
relative_results     1463
url                     0
dtype: int64
['Opens soon ⋅ 8AM' None 'Closed ⋅ Opens 8:30AM' 'Closed ⋅ Opens 9AM'
 'Open ⋅ Closes 11PM' 'Closed ⋅ Opens 9:30AM' 'Closed ⋅ Opens 10AM'
 'Open 24 hours' 'Closed ⋅ Opens 7AM Thu' 'Closed ⋅ Opens 8AM Thu'
 'Closed ⋅ Opens 10AM Thu' 'Closed ⋅ Opens 7:30AM Thu'
 'Closed ⋅ Opens 4PM' 'Open ⋅ Closes 5:30PM'
 'Closes soon ⋅ 4PM ⋅ Opens 10AM Thu' 'Open ⋅ Closes 5PM' 'Open now'
 'Open ⋅ Closes 4:30PM' 'Open ⋅ Closes 8PM' 'Open ⋅ Closes 4PM'
 '

673048

In [20]:
print(reviews_data)
print(biz_meta)

                      user_id               name           time  rating  \
0       103563353519118155776          Peri Gray  1516122675780       5   
1       101824980797027237888        Suzy Berndt  1532922350314       5   
2       108711640480272777216  Rosemary Red Legs  1530969093932       5   
3       101852294221648461824         Brown Wolf  1537085635922       2   
4       108987444312280645632      C J Blue Coat  1474922375491       5   
...                       ...                ...            ...     ...   
673043  106349677749985591296            Peggy M  1541553943539       5   
673044  111132439775161876480  Gabrielle Strouse  1465971016069       2   
673045  112599289864006172672   Where's The 10mm  1439839859315       5   
673046  104289624746481025024         A J Rausch  1534031130311       5   
673047  111259369481710452736                D B  1542421998790       5   

                                         text  pics  resp  \
0       Great place to care for our ch

## Data Cleaning

Clean review and business metadata.


In [21]:
# 1. cleaning of review data

# these columns are IMPT
reviews_data = reviews_data.dropna(subset=["rating", "time", "gmap_id", "user_id"])

# Convert selected columns to string dtype
for col in ["user_name", "user_id", "text", "gmap_id"]:
    if col in reviews_data.columns:
        reviews_data[col] = reviews_data[col].astype(str)

reviews_data["time"] = pd.to_datetime(reviews_data["time"], unit="ms", utc=True)

is_dict = reviews_data["resp"].map(lambda x: isinstance(x, dict))
reviews_data["has_resp"] = is_dict.fillna(False)
reviews_data["resp_text"] = (
    reviews_data["resp"]
    .where(is_dict)
    .map(lambda d: d.get("text") if isinstance(d, dict) else None)
    .astype("string")
)
reviews_data["resp_time"] = pd.to_datetime(
    reviews_data["resp"]
    .where(is_dict)
    .map(lambda d: d.get("time") if isinstance(d, dict) else None),
    unit="ms",
    utc=True,
    errors="coerce",
)

# Presence-only (True if not null, False if null)
reviews_data["pics"] = reviews_data["pics"].notna()

reviews_data = reviews_data.rename(
    columns={"name": "user_name", "time": "review_time", "text": "review_text"}
)

In [22]:
# Check for user_id precision loss due to float conversion
float_ids = reviews_data["user_id"].apply(
    lambda x: isinstance(x, float) and ("." in str(x) or "e" in str(x))
)
if float_ids.any():
    print(
        "Warning: Some user_id values may have lost precision due to float conversion:"
    )
    print(reviews_data[float_ids]["user_id"])
else:
    print("No user_id precision loss detected.")

No user_id precision loss detected.


In [23]:
reviews_data

Unnamed: 0,user_id,user_name,review_time,rating,review_text,pics,resp,gmap_id,has_resp,resp_text,resp_time
0,103563353519118155776,Peri Gray,2018-01-16 17:11:15.780000+00:00,5,Great place to care for our children.,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT
1,101824980797027237888,Suzy Berndt,2018-07-30 03:45:50.314000+00:00,5,Th sw y are so nice,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT
2,108711640480272777216,Rosemary Red Legs,2018-07-07 13:11:33.932000+00:00,5,Went with my daughter,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT
3,101852294221648461824,Brown Wolf,2018-09-16 08:13:55.922000+00:00,2,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT
4,108987444312280645632,C J Blue Coat,2016-09-26 20:39:35.491000+00:00,5,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT
...,...,...,...,...,...,...,...,...,...,...,...
673043,106349677749985591296,Peggy M,2018-11-07 01:25:43.539000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT
673044,111132439775161876480,Gabrielle Strouse,2016-06-15 06:10:16.069000+00:00,2,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT
673045,112599289864006172672,Where's The 10mm,2015-08-17 19:30:59.315000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT
673046,104289624746481025024,A J Rausch,2018-08-11 23:45:30.311000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT


In [24]:
print(reviews_data.dtypes)

user_id                     object
user_name                   object
review_time    datetime64[ns, UTC]
rating                       int64
review_text                 object
pics                          bool
resp                        object
gmap_id                     object
has_resp                      bool
resp_text           string[python]
resp_time      datetime64[ns, UTC]
dtype: object


In [25]:
# 2. cleaning of biz meta data
biz_meta = biz_meta.dropna(subset=["gmap_id"])

# Convert selected columns to string dtype
for col in ["name", "description", "category", "user_id", "text", "gmap_id"]:
    if col in reviews_data.columns:
        reviews_data[col] = reviews_data[col].astype(str)


# Convert $ → 1, $$ → 2, etc.
biz_meta["price_level"] = biz_meta["price"].str.len()
# Fill missing with 0 = unknown
biz_meta["price_level"] = biz_meta["price_level"].fillna(0).astype("int8")
biz_meta = biz_meta.rename(columns={"name": "biz_name"})

## Data Merging

Merge relevant columns from business metadata into reviews.


In [26]:
keep_cols = [
    "gmap_id",  # join key
    "biz_name",
    "description",
    "category",
    "avg_rating",
    "num_of_reviews",
    "price_level",
]

keep_cols = [c for c in keep_cols if c in biz_meta.columns]
biz_meta = biz_meta[keep_cols].drop_duplicates(subset=["gmap_id"])

merged_reviews_data = reviews_data.merge(biz_meta, on="gmap_id", how="left")

merged_reviews_data

Unnamed: 0,user_id,user_name,review_time,rating,review_text,pics,resp,gmap_id,has_resp,resp_text,resp_time,biz_name,description,category,avg_rating,num_of_reviews,price_level
0,103563353519118155776,Peri Gray,2018-01-16 17:11:15.780000+00:00,5,Great place to care for our children.,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT,CRST WIC Office,,,4.7,8,0
1,101824980797027237888,Suzy Berndt,2018-07-30 03:45:50.314000+00:00,5,Th sw y are so nice,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT,CRST WIC Office,,,4.7,8,0
2,108711640480272777216,Rosemary Red Legs,2018-07-07 13:11:33.932000+00:00,5,Went with my daughter,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT,CRST WIC Office,,,4.7,8,0
3,101852294221648461824,Brown Wolf,2018-09-16 08:13:55.922000+00:00,2,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT,CRST WIC Office,,,4.7,8,0
4,108987444312280645632,C J Blue Coat,2016-09-26 20:39:35.491000+00:00,5,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,NaT,CRST WIC Office,,,4.7,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673043,106349677749985591296,Peggy M,2018-11-07 01:25:43.539000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT,Naughti Wines,,"[Winery, Brewery, Restaurant, Tourist attraction]",4.5,118,0
673044,111132439775161876480,Gabrielle Strouse,2016-06-15 06:10:16.069000+00:00,2,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT,Naughti Wines,,"[Winery, Brewery, Restaurant, Tourist attraction]",4.5,118,0
673045,112599289864006172672,Where's The 10mm,2015-08-17 19:30:59.315000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT,Naughti Wines,,"[Winery, Brewery, Restaurant, Tourist attraction]",4.5,118,0
673046,104289624746481025024,A J Rausch,2018-08-11 23:45:30.311000+00:00,5,,False,,0x877d4b3beecb809d:0xc8750c627d73a316,False,,NaT,Naughti Wines,,"[Winery, Brewery, Restaurant, Tourist attraction]",4.5,118,0


In [27]:
print(merged_reviews_data.dtypes)

user_id                        object
user_name                      object
review_time       datetime64[ns, UTC]
rating                          int64
review_text                    object
pics                             bool
resp                           object
gmap_id                        object
has_resp                         bool
resp_text              string[python]
resp_time         datetime64[ns, UTC]
biz_name                       object
description                    object
category                       object
avg_rating                    float64
num_of_reviews                  int64
price_level                      int8
dtype: object


In [28]:
merged_reviews_data.to_csv("../data/cleaned_google_reviews.csv", index=False)