In [1]:
CITY_NAME = "Seattle"

In [2]:
# Download GeoDataFrame of all boba shops in a particular city using Yelp API.
# Reference: https://boba-nyc.datalife.nyc/data-wrangling.html
# Writes to "./data/<city>_boba_shops.csv"

# importing libraries
import os
import pandas as pd
import geopandas as gpd
import requests
import osmnx as ox

from dotenv import load_dotenv
load_dotenv()

True

In [18]:
# saving yelp api key as an environment variable
API_KEY = os.environ.get("yelp_api")

In [19]:
# our offset parameter - each page 50 rows
offset = 0
print("initial offset number: {}".format(offset))

lst = []

# loop through the api 20 times (limit is 1000 rows with each page includes 50 rows)
for i in range(20):    
    try:
        headers = {"Authorization": "Bearer {}".format(API_KEY)}
        search_api_url = "https://api.yelp.com/v3/businesses/search"
        params = {"term": "bubble tea", 
                  "categories": "bubbletea, boba",
                  "location": CITY_NAME,
                  "offset": offset,
                  "limit": 50}

        response = requests.get(search_api_url, 
                                headers=headers, 
                                params=params, 
                                timeout=10)

        # return a dictionary
        data_dict = response.json()
        
        # convert the business dictionary to a pandas dataframe and append to list
        data = pd.DataFrame(data_dict["businesses"])
        lst.append(data)
      
        # add 50 to the offset to access a new page
        offset += 50
        print("current offset number: {}".format(offset))
        
    except Exception as ex:
        print("exception: {}\nexit loop.".format(ex))
        break

initial offset number: 0
current offset number: 50
current offset number: 100
current offset number: 150
current offset number: 200
current offset number: 250
current offset number: 300
current offset number: 350
current offset number: 400
current offset number: 450
current offset number: 500
current offset number: 550
current offset number: 600
current offset number: 650
current offset number: 700
current offset number: 750
current offset number: 800
current offset number: 850
current offset number: 900
current offset number: 950
current offset number: 1000


In [20]:
# concatenate all pages to one dataframe and reset index
df = pd.concat(lst)
df = df.reset_index(drop=True)

# review shape of dataframe
rows, columns = df.shape
print()
print("query includes {:,} rows and {} columns.".format(rows, columns))
print("row id is unique: {}.".format(df["id"].is_unique))

# review if dataframe id is unique, if not drop duplicates
if df["id"].is_unique == False:
    duplicates = df.loc[df.duplicated(subset=["id"])]
    vals = list(duplicates.head()["name"].values)
    print("\nduplicates found: {}.".format(vals))
    
    df = df.drop_duplicates(subset=["id"]).reset_index(drop=True)
    print("dropping duplicates...")
    
    rows, columns = df.shape
    print("\nrow id is unique: {}.".format(df["id"].is_unique))
    print("query includes {:,} rows and {} columns.".format(rows, columns))


# explode coordinates to create an individual column
gdf = pd.concat([df, df["coordinates"].apply(pd.Series)], axis=1)

# retrieve lat, lon values and return a geodataframe
gdf = gpd.GeoDataFrame(gdf, crs=4326, geometry=gpd.points_from_xy(gdf.longitude, gdf.latitude))

print(gdf.head())


query includes 233 rows and 16 columns.
row id is unique: True.
                       id                     alias              name  \
0  Hy2xR9KLqu9yzLgk8HD4Ww  chicha-san-chen-bellevue   CHICHA San Chen   
1  gp8frponQ2ok2DF7H_aqkg      xing-fu-tang-seattle      Xing Fu Tang   
2  nDvwrhWdB7kjIwn2Z83lSQ  seattle-best-tea-seattle  Seattle Best Tea   
3  gDNr3c5tsaeWXVDkKRrm2A   dont-yell-at-me-seattle   Dont Yell At Me   
4  AZWPfcWcF8utdCfyTE48jA          drip-tea-seattle          DRIP TEA   

                                           image_url  is_closed  \
0  https://s3-media1.fl.yelpcdn.com/bphoto/IDWbst...      False   
1  https://s3-media2.fl.yelpcdn.com/bphoto/zWO33c...      False   
2  https://s3-media3.fl.yelpcdn.com/bphoto/uxsrWj...      False   
3  https://s3-media1.fl.yelpcdn.com/bphoto/7z-feo...      False   
4  https://s3-media1.fl.yelpcdn.com/bphoto/9YrXO9...      False   

                                                 url  review_count  \
0  https://www.yelp.com

In [21]:
# Clip data to the city
city_gdf = ox.geocode_to_gdf({ "city": CITY_NAME })
print(city_gdf["geometry"][0])

print(city_gdf["geometry"][0].bounds)

gdf = gpd.clip(gdf, city_gdf)
gdf = gdf.reset_index(drop=True)

rows, columns = gdf.shape
print("Final GeoDataFrame:")
print("number of rows: {}\n\
number of columns: {}".format(rows, columns))

POLYGON ((-122.459696 47.674269, -122.45962 47.672697, -122.45962 47.671118, -122.459559 47.669539, -122.459437 47.668478, -122.45904 47.666472, -122.458582 47.664381, -122.458246 47.663343, -122.457804 47.66197, -122.457087 47.660009, -122.456446 47.658415, -122.455927 47.657148, -122.455469 47.655836, -122.455012 47.654608, -122.45463 47.653494, -122.454447 47.652655, -122.4540965 47.6521951, -122.453974 47.650824, -122.453592 47.649237, -122.453257 47.647314, -122.453103 47.646268, -122.452997 47.645552, -122.452813 47.644018, -122.452615 47.64208, -122.452554 47.639792, -122.452554 47.638765, -122.452554 47.638113, -122.452737 47.636351, -122.452935 47.634916, -122.453195 47.633673, -122.453454 47.632628, -122.453912 47.631392, -122.454171 47.630247, -122.454346 47.629627, -122.454492 47.62911, -122.45469 47.627966, -122.454751 47.627089, -122.454751 47.626204, -122.45289 47.622656, -122.449533 47.617646, -122.447743 47.614765, -122.446801 47.598135, -122.4477 47.595562, -122.45016

In [28]:
# Find more accurate locations
from shapely.geometry import Point

for index, row in gdf.iterrows():
    loc = row["location"]
    addr = f"{loc['address1']}, {loc['city']}, {loc['zip_code']}"

    print(f"Attempting to geocode {addr:>50}... ", end="")
    try:
        lat, long = ox.geocoder.geocode(addr)
        print(f"Found: {lat, long}")

        gdf.loc[index,"geometry"] = Point(long, lat)

    except Exception as e:
        print(f"Not found. Dropping row.")
        gdf.drop(index=index, axis=0)

Attempting to geocode                      600 5th Ave S, Seattle, 98104... Found: (47.5972782, -122.327447)
Attempting to geocode                      519 6th Ave S, Seattle, 98104... Found: (47.5977153, -122.3265439)
Attempting to geocode                    618 S Weller St, Seattle, 98104... Found: (47.5976337, -122.3255237)
Attempting to geocode                      619 S King St, Seattle, 98104... Found: (47.5982488, -122.3254928)
Attempting to geocode                      679 S King St, Seattle, 98104... Found: (47.5982404, -122.3239502)
Attempting to geocode                  1043 S Jackson St, Seattle, 98104... Found: (47.5988089, -122.317458)
Attempting to geocode                        613 9th Ave, Seattle, 98104... Found: (47.6059309, -122.3253983)
Attempting to geocode                      1150 11th Ave, Seattle, 98122... Found: (47.6126016, -122.317964)
Attempting to geocode                      1416 10th Ave, Seattle, 98122... Found: (47.6133604, -122.3192083)
Attempting to

In [29]:
# save file
save_path = f"./data/{CITY_NAME.lower()}_boba_shops.csv"
gdf.to_csv(save_path, index=False)
print(f"Saved boba shops in {CITY_NAME} to {save_path}")

Saved boba shops in Seattle to ./data/seattle_boba_shops.csv
