# 0. Import Libraries & Define Globals


In [158]:
# ---------
# importing
# ---------

import gzip
import json
import pandas as pd
import numpy as np

In [159]:
# -----------------
# global functions
# -----------------
def parse(path):
    g = gzip.open(path, "r")
    for l in g:
        yield json.loads(l)

In [160]:
# ---------
# constants
# ---------
FILE_PATH_1 = "data_sources/review-South_Dakota_10.json.gz"
FILE_PATH_METADATA = "data_sources/meta-South_Dakota.json.gz"

# 1. Read Data Sources


In [161]:
# -----------------
# read from kaggle
# -----------------
kaggle_df = pd.read_csv("data_sources/reviews.csv")
kaggle_rename_map = {"author_name": "reviewer_name", "rating_category": "category"}
kaggle_df["rating_category"] = "food"
kaggle_df = kaggle_df.rename(columns=kaggle_rename_map)
kaggle_df = kaggle_df.drop(columns=["photo"])


kaggle_df.head()

Unnamed: 0,business_name,reviewer_name,text,rating,category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,food
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,food
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,food
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,food
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,food


In [None]:
# -- populate avg rating -> num of reviews in kaggle) 
avg_ratings_business = kaggle_df.groupby("business_name")["rating"].mean()
kaggle_df["avg_rating"] = kaggle_df["business_name"].map(avg_ratings_business)
num_reviews_business = kaggle_df.groupby("business_name")["text"].count()
kaggle_df["num_of_reviews"] = kaggle_df["business_name"].map(num_reviews_business)
kaggle_df.head()

Unnamed: 0,business_name,reviewer_name,text,rating,category,avg_rating,num_of_reviews
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,food,4.454545,11
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,food,4.454545,11
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,food,4.454545,11
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,food,4.454545,11
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,food,4.454545,11


In [163]:
kaggle_df.shape

(1100, 7)

In [164]:
# -----------------
# read south dakota
# -----------------

records = list(parse(FILE_PATH_1))
df = pd.DataFrame(records)

meta_data_records = list(parse(FILE_PATH_METADATA))
metadata_df = pd.DataFrame(meta_data_records)


google_location_df = pd.merge(
    df, metadata_df, how="left", on=["gmap_id"], suffixes=["_x", "_y"]
)

rename_map = {
    "name_x": "reviewer_name",
    "name_y": "business_name",
}

google_location_df = google_location_df.rename(columns=rename_map)
google_location_df = google_location_df.drop(
    columns=[
        "user_id",
        "address",
        "url",
        "relative_results",
        "MISC",
        "hours",
        "gmap_id",
        "avg_rating",
    ]
)


google_location_df.head()

Unnamed: 0,reviewer_name,time,rating,text,pics,resp,business_name,description,latitude,longitude,category,num_of_reviews,price,state
0,Peri Gray,1516122675780,5,Great place to care for our children.,,,CRST WIC Office,,44.990878,-101.239919,,8,,
1,Peri Gray,1516122675780,5,Great place to care for our children.,,,CRST WIC Office,,44.990878,-101.239919,,8,,
2,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,CRST WIC Office,,44.990878,-101.239919,,8,,
3,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,CRST WIC Office,,44.990878,-101.239919,,8,,
4,Rosemary Red Legs,1530969093932,5,Went with my daughter,,,CRST WIC Office,,44.990878,-101.239919,,8,,


In [165]:
google_location_df.shape

(673628, 14)

In [166]:
# Add missing columns from google_location_df to kaggle_df with NaN values
for col in google_location_df.columns:
    if col not in kaggle_df.columns:
        kaggle_df[col] = None

# Reorder kaggle_df columns to match google_location_df columns
kaggle_df = kaggle_df[google_location_df.columns]

# 2. Finalised Schema


In [167]:
# Concatenate the two dataframes (union)
final_df = pd.concat([google_location_df, kaggle_df], ignore_index=True)
final_df["review_id"] = final_df.index  # Use index as a unique ID for reviews
reviewer_name_to_id = {
    name: idx for idx, name in enumerate(final_df["reviewer_name"].unique())
}
final_df["reviewer_id"] = final_df["reviewer_name"].map(reviewer_name_to_id)

  final_df = pd.concat([google_location_df, kaggle_df], ignore_index=True)


In [168]:
final_df.head()

Unnamed: 0,reviewer_name,time,rating,text,pics,resp,business_name,description,latitude,longitude,category,num_of_reviews,price,state,review_id,reviewer_id
0,Peri Gray,1516122675780,5,Great place to care for our children.,,,CRST WIC Office,,44.990878,-101.239919,,8,,,0,0
1,Peri Gray,1516122675780,5,Great place to care for our children.,,,CRST WIC Office,,44.990878,-101.239919,,8,,,1,0
2,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,CRST WIC Office,,44.990878,-101.239919,,8,,,2,1
3,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,CRST WIC Office,,44.990878,-101.239919,,8,,,3,1
4,Rosemary Red Legs,1530969093932,5,Went with my daughter,,,CRST WIC Office,,44.990878,-101.239919,,8,,,4,2


In [169]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674728 entries, 0 to 674727
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewer_name   674728 non-null  object 
 1   time            673628 non-null  object 
 2   rating          674728 non-null  int64  
 3   text            348468 non-null  object 
 4   pics            15825 non-null   object 
 5   resp            83260 non-null   object 
 6   business_name   674728 non-null  object 
 7   description     369935 non-null  object 
 8   latitude        673628 non-null  float64
 9   longitude       673628 non-null  float64
 10  category        674559 non-null  object 
 11  num_of_reviews  674728 non-null  int64  
 12  price           362462 non-null  object 
 13  state           412801 non-null  object 
 14  review_id       674728 non-null  int64  
 15  reviewer_id     674728 non-null  int64  
dtypes: float64(2), int64(4), object(10)
memory usage: 82.4+ 

In [170]:
final_df["reviewer_name"].nunique()

27129