In [1]:
CITY_NAME = "Seattle"

In [3]:
# Download GeoDataFrame of all boba shops in a particular city using Yelp API.
# Reference: https://boba-nyc.datalife.nyc/data-wrangling.html
# Writes to "./data/<city>_boba_shops.csv"

# importing libraries
import os
import pandas as pd
import geopandas as gpd
import requests
import osmnx as ox

from dotenv import load_dotenv
load_dotenv()

CITY_NAME = os.environ["CITY_NAME"]
CITY_SLUG = os.environ["CITY_SLUG"]
print(f"City name: {CITY_NAME}")
print(f"City slug: {CITY_SLUG}")

API_KEY = os.environ.get("YELP_API_KEY")

City name: Cambridge, MA, USA
City slug: cambridge_ma


In [4]:
# our offset parameter - each page 50 rows
offset = 0
print("initial offset number: {}".format(offset))

lst = []

# loop through the api 20 times (limit is 1000 rows with each page includes 50 rows)
for i in range(20):    
    try:
        headers = {"Authorization": "Bearer {}".format(API_KEY)}
        search_api_url = "https://api.yelp.com/v3/businesses/search"
        params = {"term": "bubble tea", 
                  "categories": "bubbletea, boba",
                  "location": CITY_NAME,
                  "offset": offset,
                  "limit": 50}

        response = requests.get(search_api_url, 
                                headers=headers, 
                                params=params, 
                                timeout=10)

        # return a dictionary
        data_dict = response.json()
        
        # convert the business dictionary to a pandas dataframe and append to list
        data = pd.DataFrame(data_dict["businesses"])
        lst.append(data)
      
        # add 50 to the offset to access a new page
        offset += 50
        print("current offset number: {}".format(offset))
        
    except Exception as ex:
        print("exception: {}\nexit loop.".format(ex))
        break

initial offset number: 0
current offset number: 50
current offset number: 100
current offset number: 150
current offset number: 200
current offset number: 250
current offset number: 300
current offset number: 350
current offset number: 400
current offset number: 450
current offset number: 500
current offset number: 550
current offset number: 600
current offset number: 650
current offset number: 700
current offset number: 750
current offset number: 800
current offset number: 850
current offset number: 900
current offset number: 950
current offset number: 1000


In [5]:
# concatenate all pages to one dataframe and reset index
df = pd.concat(lst)
df = df.reset_index(drop=True)

# review shape of dataframe
rows, columns = df.shape
print()
print("query includes {:,} rows and {} columns.".format(rows, columns))
print("row id is unique: {}.".format(df["id"].is_unique))

# review if dataframe id is unique, if not drop duplicates
if df["id"].is_unique == False:
    duplicates = df.loc[df.duplicated(subset=["id"])]
    vals = list(duplicates.head()["name"].values)
    print("\nduplicates found: {}.".format(vals))
    
    df = df.drop_duplicates(subset=["id"]).reset_index(drop=True)
    print("dropping duplicates...")
    
    rows, columns = df.shape
    print("\nrow id is unique: {}.".format(df["id"].is_unique))
    print("query includes {:,} rows and {} columns.".format(rows, columns))


# explode coordinates to create an individual column
gdf = pd.concat([df, df["coordinates"].apply(pd.Series)], axis=1)

# retrieve lat, lon values and return a geodataframe
gdf = gpd.GeoDataFrame(gdf, crs=4326, geometry=gpd.points_from_xy(gdf.longitude, gdf.latitude))

gdf.head()


query includes 99 rows and 16 columns.
row id is unique: True.


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price,latitude,longitude,geometry
0,Yr1f5aq-4YEG3cZbGvD6vg,tiger-sugar-cambridge,Tiger Sugar,https://s3-media3.fl.yelpcdn.com/bphoto/oBCuUz...,False,https://www.yelp.com/biz/tiger-sugar-cambridge...,15,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}]",4.0,"{'latitude': 42.37323, 'longitude': -71.11973}",[],"{'address1': '5 Jfk St', 'address2': None, 'ad...",18572688786.0,(857) 268-8786,868.572415,,42.37323,-71.11973,POINT (-71.11973 42.37323)
1,o1PqQLZ4f2zKN3i3NcgH7A,onezo-cambridge,OneZo,https://s3-media3.fl.yelpcdn.com/bphoto/ahtLBF...,False,https://www.yelp.com/biz/onezo-cambridge?adjus...,16,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}...",3.5,"{'latitude': 42.38597, 'longitude': -71.11962}",[],"{'address1': '1792 Massachusetts Ave', 'addres...",,,1050.231272,,42.38597,-71.11962,POINT (-71.11962 42.38597)
2,cNNTPEqz5uMY5cjCW7M8oA,tea-dō-central-sq-cambridge-3,Tea-Dō Central Sq,https://s3-media1.fl.yelpcdn.com/bphoto/-ie9cX...,False,https://www.yelp.com/biz/tea-d%C5%8D-central-s...,7,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}...",5.0,"{'latitude': 42.363876, 'longitude': -71.100404}","[pickup, delivery]","{'address1': '425 Massachusetts Ave', 'address...",16177508564.0,(617) 750-8564,1871.351639,,42.363876,-71.100404,POINT (-71.10040 42.36388)
3,Li7MJ20quTnLRRmBbET_2A,little-bake-cambridge,Little Bake,https://s3-media3.fl.yelpcdn.com/bphoto/KZu-6R...,False,https://www.yelp.com/biz/little-bake-cambridge...,26,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}...",5.0,"{'latitude': 42.3894663, 'longitude': -71.1197...","[pickup, delivery]","{'address1': '1925 Massachusetts Ave', 'addres...",16177145243.0,(617) 714-5243,1389.237114,,42.389466,-71.119733,POINT (-71.11973 42.38947)
4,fuf2ZDzMRfu1Xx0WugenZw,gong-cha-cambridge-2,Gong Cha,https://s3-media3.fl.yelpcdn.com/bphoto/VGaBEj...,False,https://www.yelp.com/biz/gong-cha-cambridge-2?...,3,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}...",4.5,"{'latitude': 42.374141, 'longitude': -71.120671}","[pickup, delivery]","{'address1': '50 Church St', 'address2': '', '...",16175079258.0,(617) 507-9258,841.831661,,42.374141,-71.120671,POINT (-71.12067 42.37414)


In [7]:
# Clip data to the city
city_gdf = ox.geocode_to_gdf(CITY_NAME)
print(city_gdf["geometry"][0])

print(city_gdf["geometry"][0].bounds)

gdf = gpd.clip(gdf, city_gdf)
gdf = gdf.reset_index(drop=True)

rows, columns = gdf.shape
print("Final GeoDataFrame:")
print("number of rows: {}\n\
number of columns: {}".format(rows, columns))

POLYGON ((-71.1603989 42.386542, -71.15814 42.3830416, -71.1579707 42.3828054, -71.1578481 42.3825523, -71.1578097 42.3824544, -71.1578181 42.3823453, -71.1577691 42.3820902, -71.157634 42.3820985, -71.1573001 42.3819801, -71.1569605 42.3818528, -71.1565562 42.3817123, -71.1559114 42.3815581, -71.1556228 42.3814807, -71.1552005 42.381164, -71.1553339 42.3809085, -71.1550061 42.3803167, -71.1550506 42.3802, -71.1562312 42.3806362, -71.1543725 42.3750939, -71.150378 42.3746132, -71.1502391 42.3743771, -71.1471251 42.3749993, -71.1464549 42.3751243, -71.1459373 42.3751953, -71.1450913 42.3752756, -71.1444483 42.375284, -71.1436307 42.375277, -71.1439251 42.3740742, -71.1399918 42.3735687, -71.1400742 42.3726639, -71.1401128 42.3720059, -71.1401924 42.3715554, -71.1402789 42.3710996, -71.1403723 42.3706387, -71.1404507 42.3703592, -71.1405995 42.3700176, -71.1408111 42.3696815, -71.1410086 42.3693348, -71.141347 42.3688126, -71.1416219 42.3683833, -71.1418335 42.368042, -71.1421014 42.3676

In [8]:
# Find more accurate locations
from shapely.geometry import Point

for index, row in gdf.iterrows():
    loc = row["location"]
    addr = f"{loc['address1']}, {loc['city']}, {loc['zip_code']}"

    print(f"Attempting to geocode {addr:>50}... ", end="")
    try:
        lat, long = ox.geocoder.geocode(addr)
        print(f"Found: {lat, long}")

        gdf.loc[index,"geometry"] = Point(long, lat)

    except Exception as e:
        print(f"Not found. Dropping row.")
        gdf.drop(index=index, axis=0)

Attempting to geocode            425 Massachusetts Ave, Cambridge, 02139... Found: (42.3640794, -71.1014654)
Attempting to geocode             100 Cambridgeside Pl, Cambridge, 02141... Found: (42.3687832, -71.0759859)
Attempting to geocode             100 Cambridgeside Pl, Cambridge, 02141... Found: (42.3687832, -71.0759859)
Attempting to geocode            605 Massachusetts Ave, Cambridge, 02139... Found: (42.3651657, -71.1029784)
Attempting to geocode            955 Massachusetts Ave, Cambridge, 02139... Found: (42.3691092, -71.110072)
Attempting to geocode           1160 Massachusetts Ave, Cambridge, 02138... Found: (42.37119310917814, -71.11451245548736)
Attempting to geocode        54 John F. Kennedy Street, Cambridge, 02138... Found: (42.371938, -71.12058049999999)
Attempting to geocode                         5 Jfk St, Cambridge, 02138... Not found. Dropping row.
Attempting to geocode                     50 Church St, Cambridge, 02138... Found: (42.374218, -71.120629)
Attempting

In [10]:
# save file
save_path = f"./data/{CITY_SLUG}_boba_shops.csv"
gdf.to_csv(save_path, index=False)
print(f"Saved boba shops in {CITY_NAME} to {save_path}")

Saved boba shops in Cambridge, MA, USA to ./data/cambridge_ma_boba_shops.csv
